From a5f5f62e925e382294e2dc6aeff65eac473e5ef2 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 9 May 2025 01:00:43 +0000 Subject: [PATCH 01/26] [NVPTX] Add syncscope support for cmpxchg --- llvm/include/llvm/CodeGen/TargetLowering.h | 16 +- llvm/lib/CodeGen/AtomicExpandPass.cpp | 15 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 10 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +- llvm/lib/Target/ARM/ARMISelLowering.h | 10 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 13 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 12 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 52 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelLowering.h | 13 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 12 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 16526 ++++++++++-- llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 16584 ++++++++++-- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 22252 ++++++++++++++-- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 20 +- llvm/test/CodeGen/NVPTX/cmpxchg.py | 13 +- .../NVPTX/distributed-shared-cluster.ll | 36 +- 18 files changed, 47703 insertions(+), 7899 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 9c3cede359c15..d11e2ca22b189 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2325,13 +2325,15 @@ class LLVM_ABI TargetLoweringBase { /// standard ABI uses a fence before a seq_cst load instead of after a /// seq_cst store). 
/// @{ - virtual Instruction *emitLeadingFence(IRBuilderBase &Builder, - Instruction *Inst, - AtomicOrdering Ord) const; - - virtual Instruction *emitTrailingFence(IRBuilderBase &Builder, - Instruction *Inst, - AtomicOrdering Ord) const; + virtual Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const; + + virtual Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const; /// @} // Emits code that executes when the comparison result in the ll/sc diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 3f3d5dc90711f..bc400b28d26af 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -314,6 +314,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; + SyncScope::ID SSID = SyncScope::System; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); @@ -336,13 +337,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { // expandAtomicCmpXchg in that case. FenceOrdering = CASI->getMergedOrdering(); auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI); + SSID = CASI->getSyncScopeID(); CASI->setSuccessOrdering(CASOrdering); CASI->setFailureOrdering(CASOrdering); + // If CAS ordering is monotonic, then the operation will + // take default scope. 
Otherwise, it will retain its scope + if (CASOrdering != AtomicOrdering::Monotonic) + CASI->setSyncScopeID(SSID); } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering); + MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID); } } else if (I->hasAtomicStore() && TLI->shouldInsertTrailingFenceForAtomicStore(I)) { @@ -443,12 +449,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F, } bool AtomicExpandImpl::bracketInstWithFences(Instruction *I, - AtomicOrdering Order) { + AtomicOrdering Order, + SyncScope::ID SSID) { ReplacementIRBuilder Builder(I, *DL); - auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); + auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID); - auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID); // We have a guard here because not every atomic operation generates a // trailing fence. if (TrailingFence) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 0a077b7b61437..6c4a480b5ca87 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2383,18 +2383,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) - return Builder.CreateFence(Ord); + return Builder.CreateFence(Ord, SSID); else return nullptr; } Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (isAcquireOrStronger(Ord)) - return Builder.CreateFence(Ord); + return Builder.CreateFence(Ord, SSID); else return nullptr; } diff --git 
a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 05ca11cfac5cb..1a409c3165f49 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21190,7 +21190,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -21215,7 +21216,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 604910e04d4cc..79926386cde1e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -674,10 +674,12 @@ class VectorType; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction *emitLeadingFence( + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction *emitTrailingFence( + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; unsigned getMaxSupportedInterleaveFactor() const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp 
b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bb0aeb493ed48..d0df26087d2e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6266,7 +6266,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (!isa(Inst)) - return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); + return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord, SSID); @@ -6274,15 +6275,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated if (isReleaseOrStronger(Ord)) return Ord == AtomicOrdering::SequentiallyConsistent - ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) - : Builder.CreateFence(AtomicOrdering::Release); + ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, + SSID) + : Builder.CreateFence(AtomicOrdering::Release, SSID); return nullptr; } Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { // Specialize for cmpxchg if (!isa(Inst)) - return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord, SSID); @@ -6295,7 +6298,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || CASWidth < STI.getMinCmpXchgSizeInBits())) - return Builder.CreateFence(AtomicOrdering::Acquire); + return Builder.CreateFence(AtomicOrdering::Acquire, SSID); return nullptr; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 2477e1fb61595..d60d04e65c460 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -264,10 +264,14 @@ class NVPTXTargetLowering : public 
TargetLowering { AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d840324ce8238..cc0df4d3f0900 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -41,6 +41,27 @@ def AS_match { }]; } +multiclass nvvm_ternary_atomic_op_scoped { + defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val); + def NAME#_cta: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Block; + }]>; + def NAME#_cluster : PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Cluster; + }]>; + def NAME#_gpu: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Device; + }]>; + def NAME#_sys: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ + return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::System; + }]>; +} + + // A node that will be replaced with the current PTX version. 
class PTX { SDNodeXForm PTXVerXform = SDNodeXForm, preds>; } -multiclass F_ATOMIC_3_AS preds = []> { +multiclass F_ATOMIC_3_AS preds = []> { defvar frag_pat = (frag node:$a, node:$b, node:$c); - defm _G : F_ATOMIC_3, preds>; - defm _S : F_ATOMIC_3, preds>; - defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; - defm _GEN : F_ATOMIC_3, preds>; + defm _G : F_ATOMIC_3, preds>; + defm _S : F_ATOMIC_3, preds>; + defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; + defm _GEN : F_ATOMIC_3, preds>; } // atom_add @@ -1916,18 +1937,30 @@ foreach t = [I32RT, I64RT] in { foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); defvar atomic_cmp_swap_pat = !cast("atomic_cmp_swap_i"#t.Size#_#order); + + // Instantiate scoped versions of the atomic compare and swap pattern + defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped; + + foreach scope = ["cta", "cluster", "gpu", "sys"] in { + defvar atomic_cmp_swap_pat_scoped = !cast("atomic_cmp_swap_i"#t.Size#_#order#_#scope); + + // Syncscope is only supported for SM70+ + defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope + : F_ATOMIC_3_AS, hasPTX<63>]>; + } + // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. defm INT_PTX_ATOM_CAS_#t.Size#_#order - : F_ATOMIC_3_AS, hasPTX<63>]>; + : F_ATOMIC_3_AS, hasPTX<63>]>; defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old - : F_ATOMIC_3_AS; + : F_ATOMIC_3_AS; } } // Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; +defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; // Support for scoped atomic operations. 
Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} @@ -1957,7 +1990,8 @@ multiclass ATOM3N_impl Preds> { defm "" : F_ATOMIC_3( "int_nvvm_atomic_" # OpStr diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index b96505816dee8..7e2bd684a3e06 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12820,7 +12820,8 @@ Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder, // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -12830,7 +12831,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 4c88bd372b106..3e99f8b8d21b6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -932,11 +932,14 @@ namespace llvm { Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + 
emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; bool shouldInlineQuadwordAtomics() const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 35fbac04b3405..a08b4aac24e06 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23306,7 +23306,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint( Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); @@ -23322,7 +23323,8 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a1b283e35074a..05ea2e5759f80 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -232,10 +232,14 @@ class RISCVTargetLowering : public TargetLowering { // than this hook due to limitations in the interface here. 
bool shouldInsertFencesForAtomic(const Instruction *I) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 9f900c961d2ed..0281212659da0 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11,20 +11,20 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, 
[monotonic_monotonic_i8_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,14 +41,14 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB0_1; ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,21 +56,21 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -86,14 +86,14 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, 
i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,21 +101,21 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -131,14 +131,14 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, 
%r8; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,21 +146,21 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: 
shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -192,20 +191,20 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -238,21 +236,21 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
monotonic_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -284,22 +281,21 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 
[func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -331,22 +326,21 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, 
[monotonic_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -378,21 +371,20 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -425,20 +416,20 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -456,14 +447,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -471,21 +462,21 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; 
SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 
%cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -517,21 +508,21 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -563,21 +554,21 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; 
SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -594,14 +585,14 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -609,20 +600,20 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 
%r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -655,21 +646,21 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -701,22 +692,21 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -733,14 +723,14 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // 
%partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -748,22 +738,21 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; 
+; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -795,21 +784,20 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: 
not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -842,21 +830,21 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; 
SM60-NEXT: .reg .b16 %rs<2>; @@ -888,22 +877,22 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -934,22 +924,22 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; 
+; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -980,22 +971,22 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1012,14 +1003,14 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1027,21 +1018,21 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1074,22 
+1065,22 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 
[func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1121,22 +1112,22 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: 
ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1153,14 +1144,14 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1168,22 +1159,22 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: 
ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1215,21 +1206,21 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 
%rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; 
SM60-NEXT: .reg .b16 %rs<2>; @@ -1262,21 +1253,20 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1294,14 +1284,14 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1309,22 +1299,21 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 
@@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1356,22 +1345,21 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: 
shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1403,22 +1391,21 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1435,14 +1422,14 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB30_1; ; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1450,21 +1437,20 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 
@acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1497,22 +1483,21 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1528,15 +1513,15 @@ define i8 
@acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB32_1; ; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1544,22 +1529,21 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; 
SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1576,14 +1560,14 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB33_1; ; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1591,22 +1575,21 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, 
%r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB34_1; ; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1638,21 +1621,20 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB35_1; ; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 
@seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1685,21 +1667,20 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1717,14 +1698,14 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB36_1; ; SM60-NEXT: $L__BB36_3: // 
%partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1732,22 +1713,21 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: 
ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB37_1; ; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1779,22 +1759,21 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 
%r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB38_1; ; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1826,22 +1805,21 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; 
SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1858,14 +1836,14 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB39_1; ; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1873,21 +1851,20 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB40_1; ; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], 
%r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1920,22 +1897,21 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: 
ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB41_1; ; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1967,22 +1943,21 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 
%r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1999,14 +1974,14 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB42_1; ; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2014,22 +1989,21 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: 
.reg .b16 %rs<2>; @@ -2061,21 +2035,20 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2092,37 +2065,39 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB44_1; ; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 
; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2137,36 +2112,39 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB45_1; ; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: 
and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2181,36 +2159,39 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB46_1; ; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, 
%rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2225,36 +2206,39 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB47_1; ; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2270,36 +2254,38 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB48_1; ; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } 
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2314,37 +2300,39 @@ define i16 @monotonic_acquire_i16_global(ptr 
addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2359,38 +2347,39 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: 
and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2406,37 +2395,38 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2451,38 +2441,39 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, 
%r19, %r3; @@ -2497,37 +2488,39 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2542,37 +2535,38 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2587,37 +2581,38 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB55_1; ; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2632,37 +2627,38 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2677,37 +2673,38 @@ define 
i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2722,37 +2719,38 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; 
SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2767,38 +2765,38 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2813,38 +2811,38 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } 
-define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2859,38 +2857,38 @@ define i16 
@acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 
255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2905,38 +2903,38 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: 
and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2951,37 +2949,39 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -2996,37 +2996,39 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret 
i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3041,37 +3043,39 @@ define i16 
@release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3087,37 +3091,38 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 
%r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3132,38 +3137,39 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[release_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3178,38 +3184,39 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("device") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3225,37 +3232,38 @@ define i16 
@release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB69_1; ; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 
%r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3270,38 +3278,39 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB70_1; ; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 
%r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3316,38 +3325,39 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB71_1; ; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3363,37 +3373,38 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 
@acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3408,38 +3419,39 @@ define i16 @acq_rel_monotonic_i16_global(ptr 
addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 
%r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3454,38 +3466,39 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; 
SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3501,37 +3514,38 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; 
SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3546,38 +3560,39 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 
%new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3592,38 +3607,39 @@ define i16 
@acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3639,37 +3655,38 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 
3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3684,38 +3701,39 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3730,38 +3748,39 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 
%cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3777,37 +3796,38 @@ 
define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3822,38 +3842,39 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3868,38 +3889,39 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3915,37 +3937,38 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB84_1; ; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 
%new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -3960,38 +3983,39 @@ define i16 
@seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -4006,38 +4030,39 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -4053,37 +4078,38 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -4098,38 +4124,39 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB88_1; ; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; @@ -4144,1537 +4171,12862 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB89_1; ; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB90_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1 +; 
SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB90_1; +; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB91_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB91_1; +; SM60-NEXT: $L__BB91_3: // 
%partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB92_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB92_1; +; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB93_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB93_1; +; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) 
{ +; SM60-LABEL: acq_rel_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB94_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB94_1; +; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; 
SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB95_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB95_1; +; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, 
[acq_rel_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB96_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB96_1; +; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: 
membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB97_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB97_1; +; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; 
SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB98_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB98_1; +; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; 
SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB99_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB99_1; +; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, 
[acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB100_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB100_1; +; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; 
SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB101_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB101_1; +; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; 
SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB102_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB102_1; +; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, 
%r18; +; SM60-NEXT: @%p1 bra $L__BB103_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB103_1; +; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB104_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: 
Header=BB104_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB104_1; +; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB105_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 
%r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB105_1; +; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB106_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB106_1; +; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; 
SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB107_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB107_1; +; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB108_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB108_1; +; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
seq_cst_monotonic_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB109_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB109_1; +; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; 
SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB110_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB110_1; +; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, 
[seq_cst_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB111_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB111_1; +; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; 
SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB112_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB112_1; +; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: 
and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB113_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB113_1; +; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 
%r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB114_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB114_1; +; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 
255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB115_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB115_1; +; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; 
+; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB116_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB116_1; +; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: 
$L__BB117_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB117_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB117_1; +; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; 
SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB118_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB118_1; +; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB119_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB119_1; +; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB120_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 
%p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB120_1; +; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB121_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB121_1; +; SM60-NEXT: $L__BB121_3: // 
%partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB122_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB122_1; +; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB123_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB123_1; +; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 
@seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB124_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB124_1; +; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( +; 
SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB125_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB125_1; +; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB126_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB126_1; +; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB127_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB127_1; +; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 
%r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB128_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB128_1; +; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; 
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB129_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB129_1; +; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: 
ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB130_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB130_1; +; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 
%r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB131_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB131_1; +; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB132_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB132_1; +; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB133_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB133_1; +; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB134_3; +; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB134_1; +; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB135_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB135_1; +; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB136_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB136_1; +; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, 
i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB137_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB137_1; +; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
monotonic_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB138_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB138_1; +; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB139_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB139_1; +; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB140_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB140_1; +; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB141_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB141_1; +; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: 
not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB142_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB142_1; +; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; 
SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB143_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB143_1; +; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, 
%r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB144_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB144_1; +; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB145_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB145_1; +; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB146_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB146_1; +; 
SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB147_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB147_1; +; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB148_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB148_1; +; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 
%cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB149_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB149_1; +; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 
%r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB150_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB150_1; +; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: 
ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB151_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB151_1; +; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, 
[monotonic_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB152_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB152_1; +; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, 
%r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB153_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB153_1; +; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB154_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB154_1; +; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop 
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB155_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB155_1; +; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 
%r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB156_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB156_1; +; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB157_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB157_1; +; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB158_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 
%p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB158_1; +; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB159_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB159_1; +; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; 
SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB160_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB160_1; +; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, 
i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB161_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB161_1; +; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 
%new) { +; SM60-LABEL: acquire_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB162_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB162_1; +; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB163_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB163_1; +; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB164_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB164_1; +; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; 
SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB165_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB165_1; +; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, 
%rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB166_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB166_1; +; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; 
SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB167_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB167_1; +; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, 
%r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB168_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB168_1; +; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB169_3; +; SM60-NEXT: // %bb.2: 
// %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB169_1; +; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB170_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB170_1; +; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB171_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB171_1; +; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; 
SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB172_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB172_1; +; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acquire_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB173_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB173_1; +; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: 
// %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB174_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB174_1; +; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 
%rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB175_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB175_1; +; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: 
shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB176_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB176_1; +; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB177_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB177_1; +; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB178_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB178_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB178_1; +; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB179_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB179_1; +; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB180_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB180_1; +; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB181_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; 
SM60-NEXT: @%p2 bra $L__BB181_1; +; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB182_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB182_1; +; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold 
= cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB183_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB183_1; +; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) 
%addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB184_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB184_1; +; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; 
+; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB185_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB185_1; +; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, 
[acquire_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB186_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB186_1; +; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: 
ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB187_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB187_1; +; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: 
and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB188_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB188_1; +; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: 
not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB189_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB189_1; +; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; 
+; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB190_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB190_1; +; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: 
atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB191_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB191_1; +; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB192_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB192_1; +; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB193_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: 
@%p2 bra $L__BB193_1; +; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB194_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB194_1; +; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB195_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB195_1; +; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 
%cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB196_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB196_1; +; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 
%r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB197_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB197_1; +; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 
%rd2, [release_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB198_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB198_1; +; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB199_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB199_1; +; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; 
SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB200_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB200_1; +; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, 
[%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB201_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB201_1; +; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB202_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB202_1; +; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB203_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB203_1; +; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB204_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: 
Header=BB204_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB204_1; +; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB205_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB205_1; +; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB206_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB206_1; +; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: 
ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB207_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB207_1; +; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, 
i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB208_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB208_1; +; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg 
.b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB209_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB209_1; +; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB210_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB210_1; +; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: 
and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB211_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB211_1; +; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: 
mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB212_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB212_1; +; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 
%r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB213_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB213_1; +; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: 
$L__BB214_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB214_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB214_1; +; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 
%r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB215_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB215_1; +; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB216_3; +; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB216_1; +; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB217_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB217_1; +; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB218_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB218_1; +; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 
[func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB219_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB219_1; +; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new 
+} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB220_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB220_1; +; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acq_rel_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB221_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB221_1; +; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; 
+; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB222_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB222_1; +; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, 
[acq_rel_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB223_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB223_1; +; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: 
membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB224_1; +; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, 
%rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB225_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB225_1; +; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB226_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB226_1; +; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; 
SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB227_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB227_1; +; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, 
%r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB228_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB228_1; +; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB229_3; +; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB229_1; +; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB230_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 
%p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB230_1; +; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB231_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB231_1; +; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM60-NEXT: 
membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB232_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB232_1; +; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new 
syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB233_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB233_1; +; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acq_rel_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB234_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB234_1; +; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; 
SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB235_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB235_1; +; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: 
membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB236_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB236_1; +; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: 
and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB237_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB237_1; +; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 
%r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB238_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB238_1; +; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 
%r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB239_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB239_1; +; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB240_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB240_1; +; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB241_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB241_1; +; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB242_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: 
Header=BB242_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB242_1; +; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB243_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB243_1; +; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB244_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB244_1; +; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg 
ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB245_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB245_1; +; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) 
{ +; SM60-LABEL: seq_cst_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB246_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB246_1; +; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg 
.b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB247_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB247_1; +; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB248_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB248_1; +; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: 
membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB249_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB249_1; +; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB250_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB250_1; +; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB251_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB251_1; +; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB252_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB252_1; +; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB253_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB253_1; +; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; 
SM60-NEXT: @%p1 bra $L__BB254_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB254_1; +; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB255_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM60-NEXT: and.b32 %r8, 
%r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB255_1; +; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB256_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB256_1; +; SM60-NEXT: $L__BB256_3: // 
%partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB257_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB257_1; +; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) 
%addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB258_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB258_1; +; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 
%cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB259_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB259_1; +; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; 
SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB260_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB260_1; +; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB261_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB261_1; +; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, 
[seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB262_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB262_1; +; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 
3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB263_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB263_1; +; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB264_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB264_1; +; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB265_1: 
// %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB265_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB265_1; +; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; 
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB266_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB266_1; +; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB267_3; +; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB267_1; +; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB268_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, 
%r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB268_1; +; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB269_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB269_1; +; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM60-NEXT: 
membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, 
[monotonic_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: 
.reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new 
syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 
%r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, 
[monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; 
+; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + 
+define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, 
[%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 
@acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr 
%addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[acquire_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; 
SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg 
.b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 
@acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: 
ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 
@acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; 
SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: 
ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; 
SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr 
%addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret 
i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 
[func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: 
ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, 
i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], 
%r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: 
ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
acq_rel_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") 
acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
acq_rel_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, 
[acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg 
.b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") 
seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: 
ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: 
// %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) 
%addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; 
SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 
%rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + 
ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; 
SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: 
ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
monotonic_monotonic_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 
@monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, 
i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: 
st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 
%rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 
%cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 
%new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: 
atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; 
SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; 
SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_sys( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; 
SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_cta( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic( +define 
i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global( +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: 
ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared( +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic( +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
acquire_seq_cst_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: 
ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared( +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic( +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_sys( ; SM60: { -; 
SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global( +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[acquire_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared( +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic( +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_sys( ; SM60: { -; 
SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global( +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; 
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared( +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic( +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - 
ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared( +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic( +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, 
[acquire_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global( +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic( +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[release_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global( +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) 
%addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared( +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic( +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; 
SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared( +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
release_acquire_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic( +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[release_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global( +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared( +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: 
.reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global( +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 
%new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared( +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, 
[acq_rel_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; 
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new 
syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[seq_cst_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } 
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM60-NEXT: 
atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr 
addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; 
SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret 
i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, 
[monotonic_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared( +define i64 
@acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = 
cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 
%rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; 
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: 
acquire_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: 
ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM60: { ; 
SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr 
addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret 
i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; 
SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_gpu( ; 
SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: 
ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, 
i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared( +define i64 
@seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 28b258dc2a868..ddedc7ea36252 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11,20 +11,20 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,14 +41,14 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, 
i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,21 +56,21 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -86,14 +86,14 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 
[func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,21 +101,21 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, 
[monotonic_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -131,14 +131,14 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,21 +146,21 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, 
%r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -192,20 +191,20 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ 
-238,21 +236,21 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -284,22 +281,21 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: 
and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -331,22 +326,21 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 
%r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -378,21 +371,20 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -425,20 +416,20 @@ 
define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -456,14 +447,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 
@acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -471,21 +462,21 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, 
i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -517,21 +508,21 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: 
not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -563,21 +554,21 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; 
SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -594,14 +585,14 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -609,20 +600,20 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM70-NEXT: 
ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared( +define i8 
@monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -655,21 +646,21 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // 
%partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -701,22 +692,21 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -733,14 +723,14 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -748,22 +738,21 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, 
%rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -795,21 +784,20 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic( +define i8 
@monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -842,21 +830,21 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], 
%r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -888,22 +877,22 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, 
[monotonic_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -934,22 +924,22 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -980,22 +971,22 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; 
-; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1012,14 +1003,14 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB21_1; ; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) 
%addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1027,21 +1018,21 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 
[func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1074,22 +1065,22 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 
%r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1121,22 +1112,22 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; 
SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1153,14 +1144,14 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB24_1; ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1168,22 +1159,22 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM70-NEXT: 
fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 
%new) { -; SM70-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1215,21 +1206,21 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; 
SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1262,21 +1253,20 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1294,14 +1284,14 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1309,22 +1299,21 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; 
SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1356,22 +1345,21 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; 
SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic( +define i8 
@acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1403,22 +1391,21 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1435,14 +1422,14 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB30_1; ; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; 
SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1450,21 +1437,20 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, 
[acquire_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB31_1; ; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1497,22 +1483,21 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1528,15 +1513,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB32_1; ; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1544,22 +1529,21 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; 
SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1576,14 +1560,14 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB33_1; ; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, 
i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1591,22 +1575,21 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB34_1; ; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; 
SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1638,21 +1621,20 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB35_1; ; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1685,21 +1667,20 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1717,14 +1698,14 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB36_1; ; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1732,22 +1713,21 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 
%rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB37_1; ; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg 
.b16 %rs<2>; @@ -1779,22 +1759,21 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB38_1; ; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1826,22 +1805,21 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: 
ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1858,14 +1836,14 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB39_1; ; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1873,21 +1851,20 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB40_1; ; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1920,22 +1897,21 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB41_1; ; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; 
SM70-NEXT: .reg .b16 %rs<2>; @@ -1967,22 +1943,21 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1999,14 +1974,14 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB42_1; ; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 
%cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2014,22 +1989,21 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: 
$L__BB43_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB43_1; ; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2061,21 +2035,20 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: cvt.u32.u64 %r9, 
%rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2092,37 +2065,39 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB44_1; ; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, 
[monotonic_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2137,36 +2112,39 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB45_1; ; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 
%r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2181,36 +2159,39 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB46_1; ; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret 
i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2225,36 +2206,39 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB47_1; ; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2270,36 +2254,38 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB48_1; ; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: 
and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2314,37 +2300,39 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB49_1; ; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; 
SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2359,38 +2347,39 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB50_1; ; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new 
monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2406,37 +2395,38 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB51_1; ; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2451,38 +2441,39 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB52_1; ; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2497,37 +2488,39 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB53_1; ; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_sys( 
; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2542,37 +2535,38 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB54_1; ; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 
[func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 
%r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2587,37 +2581,38 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB55_1; ; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; 
SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2632,37 +2627,38 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB56_1; ; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: 
fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2677,37 +2673,38 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB57_1; ; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: 
.reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2722,37 +2719,38 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB58_1; ; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; 
; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; 
SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2767,38 +2765,38 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB59_1; ; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, 
%r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2813,38 +2811,38 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 
%rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2859,38 +2857,38 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB61_1; ; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) 
%addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2905,38 +2903,38 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB62_1; ; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; 
SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, 
[release_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2951,37 +2949,39 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB63_1; ; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -2996,37 +2996,39 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; 
-; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3041,37 +3043,39 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
release_acquire_i16_generic( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3087,37 +3091,38 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB66_1; ; SM70-NEXT: $L__BB66_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; 
SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3132,38 +3137,39 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB67_1; ; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 
%r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3178,38 +3184,39 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB68_1; ; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[release_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3225,37 +3232,38 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB69_1; ; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 
@release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3270,38 +3278,39 @@ define i16 
@release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB70_1; ; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3316,38 +3325,39 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB71_1; ; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; 
SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3363,37 +3373,38 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB72_1; ; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg 
.b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3408,38 +3419,39 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB73_1; ; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, 
i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB74_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3454,38 +3466,39 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB74_1; ; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; 
SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3501,37 +3514,38 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB75_1; ; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3546,38 +3560,39 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB76_1; ; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
release_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3592,38 +3607,39 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB77_1; ; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; 
SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3639,37 +3655,38 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB78_1; ; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: 
and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3684,38 +3701,39 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB79_1; ; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 
%r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3730,38 +3748,39 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB80_1; ; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
seq_cst_monotonic_i16_generic( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3777,37 +3796,38 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB81_1; ; SM70-NEXT: 
$L__BB81_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; 
+; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3822,38 +3842,39 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB82_1; ; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 
65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3868,38 +3889,39 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB83_1; ; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3915,37 +3937,38 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB84_1; ; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 
@seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -3960,38 +3983,39 @@ define i16 
@seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB85_1; ; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -4006,38 +4030,39 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB86_1; ; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -4053,37 +4078,38 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: @%p2 bra $L__BB87_1; ; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; 
-; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -4098,38 +4124,39 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB88_1; ; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 
[func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; @@ -4144,1537 +4171,12862 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB89_1; ; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: 
mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB90_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB90_1; +; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB91_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB91_1; +; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, 
i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB92_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB92_1; +; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; 
SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_generic( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; 
SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB93_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB93_1; +; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_global( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, 
%r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB94_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB94_1; +; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB95_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB95_1; +; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic( +define i8 @acq_rel_acquire_i8_shared_sys(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB96_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB96_1; +; 
SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB97_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB97_1; +; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_generic( +; SM70-NEXT: 
ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB98_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB98_1; +; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB99_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB99_1; +; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB100_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB100_1; +; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; 
+; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB101_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB101_1; +; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 
%r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB102_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB102_1; +; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 
%r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB103_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB103_1; +; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; 
SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB104_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB104_1; +; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB105_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB105_1; +; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB106_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB106_1; +; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; 
SM70-NEXT: @%p1 bra $L__BB107_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB107_1; +; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB108_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: 
Header=BB108_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB108_1; +; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB109_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; 
+; SM70-NEXT: @%p2 bra $L__BB109_1; +; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB110_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB110_1; +; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB111_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB111_1; +; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) 
%addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB112_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB112_1; +; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 
@seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB113_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB113_1; +; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
seq_cst_monotonic_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB114_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB114_1; +; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 
%rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB115_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB115_1; +; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; 
SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB116_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB116_1; +; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB117_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB117_1; +; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: 
cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB118_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB118_1; +; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: 
shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB119_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB119_1; +; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 
255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB120_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB120_1; +; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, 
[seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB121_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB121_1; +; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 
%r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB122_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB122_1; +; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop 
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB123_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB123_1; +; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 
%r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB124_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB124_1; +; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB125_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB125_1; +; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB126_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM70-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB126_1; +; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB127_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 
%r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB127_1; +; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB128_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB128_1; +; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB129_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB129_1; +; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, 
i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB130_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB130_1; +; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB131_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB131_1; +; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM70: { +; 
SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB132_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB132_1; +; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg 
.b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB133_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB133_1; +; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, 
[seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB134_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB134_1; +; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB135_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB135_1; +; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB136_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB136_1; +; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 
%r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB137_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB137_1; +; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB138_1: 
// %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB138_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB138_1; +; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB139_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB139_1; +; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB140_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB140_1; +; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB141_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: 
mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB141_1; +; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB142_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB142_1; +; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: 
ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB143_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB143_1; +; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 
@monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB144_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB144_1; +; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 
%rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB145_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB145_1; +; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, 
[monotonic_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB146_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB146_1; +; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB147_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB147_1; +; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; 
SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB148_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB148_1; +; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; 
SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB149_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB149_1; +; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, 
[%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB150_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB150_1; +; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 
+; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB151_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB151_1; +; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; 
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB152_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB152_1; +; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB153_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB153_1; +; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB154_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; 
+; SM70-NEXT: @%p2 bra $L__BB154_1; +; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB155_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB155_1; +; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 
[func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB156_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB156_1; +; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic 
seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB157_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB157_1; +; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { 
+; SM70-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB158_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB158_1; +; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 
%rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB159_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB159_1; +; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB160_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB160_1; +; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB161_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB161_1; +; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, 
[acquire_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB162_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB162_1; +; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 
65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB163_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB163_1; +; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, 
%r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB164_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB164_1; +; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB165_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB165_1; +; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, 
[%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB166_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB166_1; +; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB167_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB167_1; +; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB168_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, 
%r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB168_1; +; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB169_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB169_1; +; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB170_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB170_1; +; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 
%new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB171_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB171_1; +; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_cta( +; SM70: { 
+; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB172_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB172_1; +; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 
%rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB173_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB173_1; +; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: 
ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB174_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB174_1; +; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, 
%r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB175_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB175_1; +; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB176_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB176_1; +; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; 
SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB177_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB177_1; +; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 
%r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB178_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB178_1; +; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra 
$L__BB179_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB179_1; +; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB180_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; 
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB180_1; +; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB181_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB181_1; +; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; 
SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB182_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB182_1; +; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB183_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB183_1; +; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, 
i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB184_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB184_1; +; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 
%rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB185_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB185_1; +; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB186_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB186_1; +; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; 
+; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB187_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB187_1; +; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, 
%rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB188_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB188_1; +; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; 
SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB189_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB189_1; +; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB190_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB190_1; +; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM70-NEXT: // 
=>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB191_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB191_1; +; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, 
[%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB192_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB192_1; +; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB193_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure 
+; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB193_1; +; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB194_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: 
mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB194_1; +; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB195_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB195_1; +; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 
[func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB196_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB196_1; +; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 
%new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB197_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB197_1; +; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_sys( +; SM70: { 
+; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB198_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB198_1; +; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: 
+; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB199_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB199_1; +; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; 
SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB200_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB200_1; +; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB201_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB201_1; +; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 
%r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB202_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB202_1; +; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB203_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB203_1; +; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; 
SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB204_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB204_1; +; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB205_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB205_1; +; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: 
atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB206_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB206_1; +; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB207_3; +; SM70-NEXT: // 
%bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB207_1; +; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB208_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB208_1; +; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB209_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB209_1; +; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB210_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB210_1; +; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release 
seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB211_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB211_1; +; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; 
SM70-LABEL: release_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB212_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB212_1; +; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; 
SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB213_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB213_1; +; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, 
[release_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB214_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB214_1; +; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: 
fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB215_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB215_1; +; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, 
-4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB216_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB216_1; +; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB217_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB217_1; +; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB218_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB218_1; +; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: 
$L__BB219_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB219_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB219_1; +; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, 
%r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB220_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB220_1; +; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB221_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB221_1; +; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB222_3; +; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB222_1; +; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB223_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; 
SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB223_1; +; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB224_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB224_1; +; 
SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB225_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB225_1; +; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: 
ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB226_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB226_1; +; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, 
i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB227_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB227_1; +; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; 
SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB228_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB228_1; +; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 
%rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB229_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB229_1; +; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; 
SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB230_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB230_1; +; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 
%rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB231_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB231_1; +; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; 
SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB232_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB232_1; +; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB233_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB233_1; +; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; 
SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB234_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB234_1; +; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, 
%r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB235_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB235_1; +; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; 
SM70-NEXT: @%p1 bra $L__BB236_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB236_1; +; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB237_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 +; 
SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB237_1; +; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB238_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra 
$L__BB238_1; +; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB239_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB239_1; +; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 
[func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB240_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB240_1; +; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel 
seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB241_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB241_1; +; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; 
SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB242_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB242_1; +; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg 
.b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB243_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB243_1; +; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, 
[seq_cst_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB244_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB244_1; +; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; 
SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB245_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB245_1; +; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, 
%rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB246_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB246_1; +; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB247_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB247_1; +; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB248_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB248_1; +; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; 
SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB249_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB249_1; +; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB250_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB250_1; +; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: 
atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB251_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB251_1; +; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB252_3; +; SM70-NEXT: 
// %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB252_1; +; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB253_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB253_1; +; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB254_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB254_1; +; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB255_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB255_1; +; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst 
acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB256_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB256_1; +; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; 
SM70-LABEL: seq_cst_acquire_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB257_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB257_1; +; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; 
SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB258_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB258_1; +; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB259_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB259_1; +; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: 
fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB260_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB260_1; +; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB261_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB261_1; +; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, 
%r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB262_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB262_1; +; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; 
SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB263_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB263_1; +; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB264_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB264_1; +; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB265_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB265_1; +; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra 
$L__BB266_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB266_1; +; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB267_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM70-NEXT: 
and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB267_1; +; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB268_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB268_1; +; 
SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB269_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB269_1; +; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; 
+; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; 
SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 
%r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 
%cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr 
addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: 
fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 
@monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 
%new) { +; SM70-LABEL: acquire_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, 
[acquire_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[acquire_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; 
SM70-LABEL: acquire_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = 
cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 
%r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; 
SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) 
%addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: 
atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[release_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, 
i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 
%cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM70-NEXT: 
atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
release_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: 
ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: 
ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
acq_rel_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") 
acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; 
SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_gpu( +; 
SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel 
acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 
%r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: 
fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
acq_rel_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: 
st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr 
addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; 
SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 
%rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret 
i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; 
SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: 
// %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + 
+define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: 
atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: 
+; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret 
i32 %new +} + +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + 
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[monotonic_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: 
ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic_gpu( +; SM70: { +; 
SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 
@monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; 
SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[monotonic_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 
@monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: 
st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acquire_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
syncscope("block") acquire monotonic + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared( +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic( +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acquire_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_global( +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") 
acquire monotonic + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared( +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 
%rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: 
acquire_seq_cst_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: 
atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; 
SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_sys( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_cta( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_gpu( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: 
atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic( +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global( +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; 
SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; 
SM70-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic( +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global( +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: 
// %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared( +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic( +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, 
[release_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_shared( +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = 
cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic( +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global( +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM70-NEXT: 
ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_shared( +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global( +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; 
SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release 
seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, 
[%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared( +define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 
%new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 
%r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new 
syncscope("device") release seq_cst + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[seq_cst_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new 
syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[seq_cst_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr 
addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; 
SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + 
%pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 
%rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 
@monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: 
ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 
%new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_global( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acquire_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: 
ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; 
; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = 
cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; 
SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM70: { ; 
SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 
%new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM70-NEXT: 
atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, 
i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, 
i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], 
%rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared( +define i64 
@seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) 
%addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 368fe3f036c9e..b73c848c5e680 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11,20 +11,20 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, 
%r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,14 +41,14 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB0_1; ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -56,21 +56,21 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -86,14 +86,14 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg 
.pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -101,21 +101,21 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -131,14 +131,14 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -146,20 +146,20 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // 
%partword.cmpxchg.loop @@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -192,20 +191,20 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 
%r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -238,21 +236,21 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; 
SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -284,22 +281,21 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new 
} -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -331,21 +326,20 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop @@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; 
SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -378,21 +371,20 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -425,21 +416,21 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: 
not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -455,15 +446,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -471,21 +461,21 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -501,15 +491,14 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: acquire_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -517,20 +506,20 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop @@ -547,15 +536,14 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -563,20 +551,20 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop @@ -594,14 +582,14 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -609,21 +597,21 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 
3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -639,15 +627,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared( +define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -655,21 +643,21 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, 
[acquire_acquire_i8_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -685,15 +673,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -701,21 +689,20 @@ define i8 
@acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop @@ -732,15 +719,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("device") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -748,21 +735,20 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop @@ -780,14 +766,14 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) 
%addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -795,22 +781,21 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; 
SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -826,15 +811,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic( +define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -842,22 +827,21 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 
%r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -873,14 +857,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -888,21 +873,20 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop @@ -919,14 +903,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 
@release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -934,21 +919,20 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop @@ -965,14 +949,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, 
%r8; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -980,22 +965,21 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1011,15 +995,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global( +define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1027,22 +1011,21 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; 
SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1058,15 +1041,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1074,21 +1057,20 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop @@ -1105,15 +1087,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } 
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1121,21 +1103,21 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop @@ -1153,14 +1135,14 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1168,22 +1150,22 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 
%r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1199,15 +1181,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1215,22 +1197,22 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; 
SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1246,15 +1228,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1262,21 +1244,21 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[acq_rel_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1293,15 +1275,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 
%cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1309,21 +1291,21 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop @@ -1341,14 +1323,14 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: 
$L__BB28_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1356,22 +1338,22 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1387,15 +1369,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic( +define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1403,22 +1385,22 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, 
%r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1434,15 +1416,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB30_1; ; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1450,21 +1432,21 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 
%rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1481,15 +1463,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB31_1; ; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1497,21 +1479,21 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop @@ -1529,14 +1511,14 @@ define i8 
@acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB32_1; ; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1544,22 +1526,22 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, 
%r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1575,15 +1557,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB33_1; ; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1591,22 +1573,22 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; 
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1622,15 +1604,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB34_1; ; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1638,21 +1620,21 @@ define 
i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB35_1; ; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1685,21 +1667,20 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1717,14 +1698,14 @@ 
define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB36_1; ; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1732,22 +1713,21 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: 
not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB37_1; ; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared( +define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1779,22 +1759,21 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB38_1; ; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1826,21 +1805,20 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg 
.b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop @@ -1857,15 +1835,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB39_1; ; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 
@seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1873,21 +1851,20 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1905,14 +1882,14 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB40_1; ; 
SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1920,22 +1897,21 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, 
%r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB41_1; ; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic( +define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1967,22 +1943,21 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 
255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1998,15 +1973,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB42_1; ; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2014,21 +1989,20 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 
%rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop @@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB43_1; ; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2061,21 +2035,20 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2093,36 +2066,37 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB44_1; ; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2137,36 +2111,38 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB45_1; ; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global( +define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 
%r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2181,36 +2157,38 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB46_1; ; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + ret i8 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 
%rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2225,36 +2203,38 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB47_1; ; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + ret i8 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic( +define i8 
@acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2270,36 +2250,37 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: @%p2 bra $L__BB48_1; ; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], 
%r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, 
%r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2314,37 +2295,38 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB49_1; ; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared( +define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: 
mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2359,38 +2341,38 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB50_1; ; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2405,38 +2387,38 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB51_1; ; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
acquire_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2452,37 +2434,37 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: @%p2 bra $L__BB52_1; ; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 
[func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; 
+; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2497,37 +2479,38 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB53_1; ; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic( +define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; 
+; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2542,37 +2525,38 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB54_1; ; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; 
SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2587,37 +2571,38 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB55_1; ; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared( +define i8 
@acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2633,36 +2618,37 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: @%p2 bra $L__BB56_1; ; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, 
[%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2677,37 +2663,38 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB57_1; ; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global( +define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: 
shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2722,37 +2709,38 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB58_1; ; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + ret i8 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; 
SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2767,38 +2755,39 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB59_1; ; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + ret i8 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 
%cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2814,37 +2803,38 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; 
SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2859,38 +2849,39 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB61_1; ; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2905,38 +2896,39 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB62_1; ; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, 
[release_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2951,37 +2943,39 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB63_1; ; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global( +define i8 
@acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -2996,37 +2990,39 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB64_1; ; 
SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; 
SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3041,37 +3037,39 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic( +define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; 
SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3086,38 +3084,39 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3132,38 +3131,39 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB67_1; ; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3179,37 +3179,38 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: @%p2 bra $L__BB68_1; ; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, 
%rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3224,38 +3225,39 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB69_1; ; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_global( +define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: 
fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3270,38 +3272,39 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB70_1; ; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + ret i8 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg 
.pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3316,38 +3319,39 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB71_1; ; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + ret i8 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 
%r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3362,38 +3366,38 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB72_1; ; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 
3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3408,38 +3412,38 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB73_1; ; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared( +define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; 
SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3454,38 +3458,38 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB74_1; ; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic( +define i8 
@release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3500,38 +3504,38 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB75_1; ; SM90-NEXT: $L__BB75_3: 
// %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, 
[release_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3546,38 +3550,38 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB76_1; ; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; 
SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3592,38 +3596,38 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB77_1; ; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic( +define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3638,38 +3642,38 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB78_1; ; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new } -define i16 
@acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3684,38 +3688,38 @@ define i16 
@acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB79_1; ; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: 
and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3730,38 +3734,38 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB80_1; ; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3776,38 +3780,38 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB81_1; ; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global( +define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; 
SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3822,38 +3826,38 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB82_1; ; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + ret i8 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; 
+; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3868,38 +3872,38 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB83_1; ; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + ret i8 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; 
SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3915,37 +3919,38 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: @%p2 bra $L__BB84_1; ; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -3960,38 +3965,39 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB85_1; ; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared( +define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cluster( ; 
SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -4006,38 +4012,39 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB86_1; ; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], 
%r14; +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -4052,38 +4059,39 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB87_1; ; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -4099,37 +4107,38 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: @%p2 bra $L__BB88_1; ; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, 
[seq_cst_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; @@ -4144,1537 +4153,18556 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB89_1; ; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: 
monotonic_monotonic_i32_generic( +define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB90_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, 
%r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB90_1; +; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 
%r20, %r16, %r2; +; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB91_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB91_1; +; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB92_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB92_1; +; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB93_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB93_1; +; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 
%r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB94_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB94_1; +; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + ret i8 %new +} + +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; 
SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB95_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB95_1; +; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: 
and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB96_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB96_1; +; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB97_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB97_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB97_1; +; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB98_1: 
// %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB98_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB98_1; +; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 
%r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB99_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB99_1; +; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; 
+; SM90-NEXT: @%p1 bra $L__BB100_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB100_1; +; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB101_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in 
Loop: Header=BB101_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB101_1; +; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB102_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB102_1; +; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB103_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB103_1; +; 
SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB104_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB104_1; +; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB105_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB105_1; +; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new syncscope("block") release seq_cst + ret i8 %new +} + +define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB106_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB106_1; +; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + ret i8 %new +} + +define i8 
@release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB107_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB107_1; +; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
acq_rel_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB108_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB108_1; +; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg 
.b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB109_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB109_1; +; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[acq_rel_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB110_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB110_1; +; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB111_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB111_1; +; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; 
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB112_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB112_1; +; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 
%r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB113_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB113_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB113_1; +; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 
%r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB114_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB114_1; +; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 
%r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB115_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB115_1; +; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; 
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB116_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB116_1; +; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB117_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB117_1; +; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; 
+; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB118_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB118_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB118_1; +; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB119_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB119_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB119_1; +; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; 
SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB120_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB120_1; +; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; 
+; SM90-NEXT: @%p1 bra $L__BB121_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB121_1; +; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB122_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in 
Loop: Header=BB122_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB122_1; +; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB123_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 
%r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB123_1; +; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB124_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB124_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB124_1; +; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end +; SM90-NEXT: 
fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB125_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB125_1; +; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg 
ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB126_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB126_1; +; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel 
acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB127_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB127_1; +; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, 
i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB128_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB128_1; +; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; 
SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB129_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB129_1; +; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; 
+; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB130_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB130_1; +; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[acq_rel_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB131_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB131_1; +; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB132_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB132_1; +; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB133_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB133_1; +; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: 
shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB134_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB134_1; +; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, 
%r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB135_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB135_1; +; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; 
SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB136_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB136_1; +; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, 
%r2; +; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB137_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB137_1; +; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB138_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB138_1; +; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, 
%r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB139_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB139_1; +; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, 
%r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB140_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB140_1; +; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB141_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM90-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB141_1; +; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB142_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, 
%r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB142_1; +; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB143_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 
bra $L__BB143_1; +; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB144_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB144_1; +; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB145_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB145_1; +; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst 
monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB146_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB146_1; +; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 
%new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB147_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB147_1; +; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg 
.b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB148_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB148_1; +; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; 
SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB149_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB149_1; +; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[seq_cst_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB150_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB150_1; +; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 
%rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB151_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB151_1; +; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB152_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB152_1; +; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; 
SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB153_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB153_1; +; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; 
SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB154_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB154_1; +; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; 
SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB155_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB155_1; +; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; 
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB156_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB156_1; +; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; 
+; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB157_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB157_1; +; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB158_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB158_1; +; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: 
atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB159_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB159_1; +; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra 
$L__BB160_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB160_1; +; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB161_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB161_1 
Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB161_1; +; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB162_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, 
%r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB162_1; +; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB163_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB163_1; +; SM90-NEXT: 
$L__BB163_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB164_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB164_1; +; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 
[func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB165_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB165_1; +; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB166_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB166_1; +; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 
@seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB167_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB167_1; +; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
seq_cst_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB168_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB168_1; +; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; 
SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB169_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB169_1; +; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, 
[seq_cst_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB170_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB170_1; +; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; 
+; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB171_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB171_1; +; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, 
%r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB172_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB172_1; +; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, 
%r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB173_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB173_1; +; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB174_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB174_1; +; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: 
ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB175_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB175_1; +; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB176_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB176_1; +; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB177_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB177_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB177_1; +; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB178_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB178_1; +; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: 
atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB179_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB179_1; +; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB180_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB180_1; +; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB181_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 
bra $L__BB181_1; +; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB182_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB182_1; +; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, 
i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB183_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB183_1; +; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
monotonic_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB184_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB184_1; +; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 
%rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB185_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB185_1; +; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 
%rd2, [monotonic_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB186_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB186_1; +; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, 
[monotonic_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB187_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB187_1; +; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; 
SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB188_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB188_1; +; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, 
%r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB189_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB189_1; +; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: 
$L__BB190_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB190_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB190_1; +; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, 
%r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB191_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB191_1; +; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB192_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB192_1; +; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB193_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; 
+; SM90-NEXT: @%p2 bra $L__BB193_1; +; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB194_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB194_1; +; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 
[func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB195_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB195_1; +; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 
@monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB196_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB196_1; +; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cta( +; SM90: { +; 
SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB197_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB197_1; +; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: 
+; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB198_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB198_1; +; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB199_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB199_1; +; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, 
[monotonic_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB200_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB200_1; +; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 
3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB201_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB201_1; +; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: 
cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB202_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB202_1; +; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, 
[%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB203_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB203_1; +; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB204_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB204_1; +; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB205_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB205_1; +; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB206_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB206_1; +; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB207_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB207_1; +; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB208_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB208_1; +; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; 
SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB209_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB209_1; +; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB210_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB210_1; +; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 
%new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB211_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB211_1; +; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
monotonic_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB212_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB212_1; +; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: 
.reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB213_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB213_1; +; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB214_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB214_1; +; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 
%rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB215_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB215_1; +; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, 
[acquire_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB216_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB216_1; +; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 
65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB217_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB217_1; +; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB218_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB218_1; +; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB219_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB219_1; +; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB220_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB220_1; +; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB221_3; +; SM90-NEXT: // 
%bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB221_1; +; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB222_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; 
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB222_1; +; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB223_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB223_1; +; SM90-NEXT: $L__BB223_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB224_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB224_1; +; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg 
ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB225_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB225_1; +; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 
@acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB226_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB226_1; +; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
acquire_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB227_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB227_1; +; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg 
.b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB228_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB228_1; +; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB229_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB229_1; +; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB230_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB230_1; +; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: 
not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB231_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB231_1; +; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; 
SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB232_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB232_1; +; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 
%r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB233_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB233_1; +; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB234_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB234_1; +; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB235_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: 
Header=BB235_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB235_1; +; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB236_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra 
$L__BB236_1; +; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB237_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB237_1; +; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: 
ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB238_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB238_1; +; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire + ret i16 %new +} + 
+define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB239_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB239_1; +; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg 
.pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB240_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB240_1; +; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB241_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB241_1; +; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; +; 
SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB242_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB242_1; +; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB243_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB243_1; +; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: 
shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB244_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB244_1; +; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB245_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB245_1; +; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, 
%r15, %r2; +; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB246_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB246_1; +; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB247_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB247_1; +; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, 
[%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB248_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB248_1; +; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB249_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB249_1; +; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB250_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM90-NEXT: and.b32 %r8, 
%r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB250_1; +; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB251_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB251_1; +; 
SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB252_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB252_1; +; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg 
ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB253_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB253_1; +; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
release_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB254_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB254_1; +; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: 
.reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB255_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB255_1; +; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB256_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB256_1; +; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; 
+; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB257_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB257_1; +; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; 
SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB258_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB258_1; +; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; 
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB259_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB259_1; +; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, 
[%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB260_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB260_1; +; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 
+; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB261_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB261_1; +; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: 
atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB262_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB262_1; +; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB263_3; +; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB263_1; +; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB264_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; 
+; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB264_1; +; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB265_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB265_1; +; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB266_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB266_1; +; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB267_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB267_1; +; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; 
SM90-LABEL: release_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB268_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB268_1; +; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; 
SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB269_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB269_1; +; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: 
ld.param.u16 %rs1, [release_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB270_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB270_1; +; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB271_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB271_1; +; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, 
[release_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB272_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB272_1; +; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: 
and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB273_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB273_1; +; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; 
SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB274_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB274_1; +; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB275_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB275_1; +; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, 
%r15, %r2; +; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB276_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB276_1; +; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: 
or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB277_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB277_1; +; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; 
SM90-NEXT: @%p1 bra $L__BB278_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB278_1; +; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB279_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM90-NEXT: and.b32 
%r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB279_1; +; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB280_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB280_1; +; SM90-NEXT: 
$L__BB280_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB281_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB281_1; +; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: 
ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB282_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB282_1; +; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release 
seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB283_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB283_1; +; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; 
SM90-LABEL: release_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB284_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB284_1; +; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: 
.reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB285_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB285_1; +; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[release_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB286_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB286_1; +; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB287_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB287_1; +; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, 
[acq_rel_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB288_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB288_1; +; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 
3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB289_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB289_1; +; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; 
SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB290_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB290_1; +; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, 
%r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB291_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB291_1; +; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop 
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB292_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB292_1; +; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, 
%r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB293_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB293_1; +; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB294_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB294_1; +; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB295_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB295_1; +; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB296_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; 
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB296_1; +; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB297_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB297_1; +; 
SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB298_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB298_1; +; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; 
+; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB299_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB299_1; +; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, 
i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB300_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB300_1; +; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; 
SM90-LABEL: acq_rel_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB301_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB301_1; +; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; 
+; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB302_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB302_1; +; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[acq_rel_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB303_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB303_1; +; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; 
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB304_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB304_1; +; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB305_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB305_1; +; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; 
SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB306_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB306_1; +; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB307_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB307_1; +; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: 
ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB308_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB308_1; +; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB309_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB309_1; +; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: 
or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB310_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB310_1; +; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB311_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB311_1; +; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB312_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: 
Header=BB312_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB312_1; +; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB313_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB313_1; 
+; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB314_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB314_1; +; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; 
+; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB315_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB315_1; +; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 
@acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB316_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB316_1; +; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cta( +; SM90: { 
+; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB317_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB317_1; +; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 
%rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB318_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB318_1; +; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, 
[acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB319_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB319_1; +; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: 
fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB320_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB320_1; +; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB321_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB321_1; +; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; 
+; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB322_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB322_1; +; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: 
cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB323_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB323_1; +; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, 
[%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB324_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB324_1; +; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB325_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB325_1; +; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], 
%r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB326_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB326_1; +; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB327_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB327_1; +; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB328_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB328_1; +; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB329_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB329_1; +; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; 
SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB330_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB330_1; +; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB331_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB331_1; +; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic 
+ ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB332_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB332_1; +; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; 
SM90-LABEL: seq_cst_monotonic_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB333_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB333_1; +; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg 
.b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB334_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB334_1; +; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: 
+; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB335_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB335_1; +; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB336_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB336_1; +; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 
%rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB337_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB337_1; +; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: 
mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB338_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB338_1; +; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 
%r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB339_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB339_1; +; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: 
$L__BB340_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB340_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB340_1; +; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; 
SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB341_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB341_1; +; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB342_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB342_1; +; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB343_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB343_1; +; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB344_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB344_1; +; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB345_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB345_1; +; SM90-NEXT: $L__BB345_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB346_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB346_1; +; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], 
%r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB347_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB347_1; +; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + 
ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB348_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB348_1; +; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_cta( +; SM90: { +; SM90-NEXT: 
.reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB349_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB349_1; +; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB350_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB350_1; +; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB351_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB351_1; +; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB352_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB352_1; +; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, 
%r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB353_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB353_1; +; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 
%r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB354_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB354_1; +; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, 
%r1; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB355_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB355_1; +; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB356_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB356_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB356_1; +; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 
%r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB357_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB357_1; +; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB358_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB358_1; +; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB359_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB359_1; +; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; 
SM90-LABEL: monotonic_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[monotonic_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new +} + +define i32 
@monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; 
SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; 
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
monotonic_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 
%r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_cluster( +; 
SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret 
i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst + ret i32 %new +} + +define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: 
ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_cluster(ptr 
addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic + ret i32 %new +} + +define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[acquire_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_cluster( +; SM90: { +; 
SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} 
+ +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: 
atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[acquire_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; 
SM90-LABEL: acquire_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 
[func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; +; 
SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst + ret i32 %new +} + +define i32 
@acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst + ret i32 %new +} + +define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[acquire_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 
%rd1, [release_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { 
+; SM90-LABEL: release_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 
[func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: 
ld.param.u32 %r1, [release_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: 
.reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 
@release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 
%r1, [release_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire + ret i32 %new +} + +define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_sys( +; SM90: { +; 
SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 
%cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: 
ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_sys( +; 
SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 
[func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst + ret i32 %new +} + +define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, 
[acq_rel_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; 
SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new 
syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[acq_rel_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 
%rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 
%new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: 
ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 
%rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 
@acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: 
atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new 
syncscope("block") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst + ret i32 %new +} + +define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, 
[seq_cst_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic_gpu( +; 
SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; 
+; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new 
syncscope("cluster") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 
%r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; 
SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, 
i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, 
[seq_cst_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { +; 
SM90-LABEL: seq_cst_seq_cst_i32_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], 
%r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, 
[seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 
%new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 
%r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst + ret i32 %new +} + +define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst ret i32 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic( +define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { 
+; SM90-LABEL: monotonic_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // 
%bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: 
monotonic_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + 
ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + 
ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; 
SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst + ret i64 %new +} + +define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define 
i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr 
%addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: 
atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: 
ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic + ret i64 %new +} + +define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_sys( +; SM90: { +; 
SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: 
acquire_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 
@acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: 
ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire + ret i64 %new +} + +define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; 
SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: 
ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 
%rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acquire_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[release_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global( +define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared( +define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global( +define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic + ret i64 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared( +define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: 
release_monotonic_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new 
syncscope("cluster") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_cta( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // 
%bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire + ret i64 %new +} + +define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire + ret i64 %new +} + +define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[release_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic( +define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global( +define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 
%rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared( +define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[release_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic( +define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: 
.reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared( +define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new +} + +define i64 
@release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_gpu( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new +} + +define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_sys( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic( +define i64 
@release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global( +define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 
%rd1, [release_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst + ret i64 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared( +define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst + ret i64 %new } -define i32 
@release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; 
SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: 
release_acquire_i32_generic( +define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global( +define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; 
SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic( +define i64 @acq_rel_monotonic_i64_global_cluster(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global( +define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, 
[acq_rel_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: 
acq_rel_monotonic_i32_generic( +define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global( +define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, 
[acq_rel_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared( +define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic + ret i64 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: 
acq_rel_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global( +define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic( +define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic_gpu( ; 
SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global( +define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acq_rel_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic( +define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cluster( ; SM90: { 
-; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global( +define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; 
SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared( +define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic( +define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: 
acq_rel_acquire_i64_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global( +define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared( +define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared_gpu( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic( +define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 
%cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global( +define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst + ret i64 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared( +define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_cluster( ; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst + ret i64 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic( +define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 
%cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global( +define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global( +define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, 
i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global( +define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared( +define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr 
%addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; +; 
SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic( +define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global( +define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new 
acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global( +define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM90-NEXT: 
atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic( +define i64 
@seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global( +define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; +; 
SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic ret i64 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared( +define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 
%rd2, [release_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global( +define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; 
SM90-LABEL: release_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic( +define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[seq_cst_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; 
SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic( +define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst 
acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global( +define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared( +define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: 
ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic( +define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global( +define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared( +define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = 
cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; 
SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic( +define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global( +define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic( +define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 
%rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global( +define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared( +define i64 
@seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.shared.cas.b64 
%rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global( +define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: 
ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 25b4c74086dc1..547b13136ff93 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -1316,7 +1316,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i32( @@ -1358,7 +1358,7 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i32( @@ -1400,7 +1400,7 @@ define i32 @acquire_sys_i32(ptr %addr, 
i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i32( @@ -1442,7 +1442,7 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i32( @@ -1486,7 +1486,7 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i32( @@ -1529,7 +1529,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; -; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i64( @@ -1568,7 +1568,7 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i64( @@ -1607,7 +1607,7 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i64( @@ -1646,7 +1646,7 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i64( @@ -1687,7 +1687,7 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i64( diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index ae7450015ecd2..277704bd9d5a5 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -5,8 +5,8 @@ from itertools import product cmpxchg_func = 
Template( - """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { - %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure + """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure ret i$size %new } """ @@ -38,9 +38,12 @@ for sm, ptx in TESTS: with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) - for size, success, failure, addrspace in product( - SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES + for size, success, failure, addrspace, llvm_scope in product( + SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES, LLVM_SCOPES ): + # cluster ordering is supported from SM90 onwards + if sm != 90 and llvm_scope == "cluster": + continue if addrspace == 0: addrspace_cast = "" else: @@ -52,6 +55,8 @@ size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast, + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], ), file=fp, ) diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index a1020e68e1bae..70330d322decf 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -171,30 +171,30 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; -; CHECK-NEXT: 
atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; +; 
CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; ; CHECK-NEXT: cvt.u32.u64 %r33, %rd2; ; CHECK-NEXT: and.b32 %r34, %r33, 3; From 15c4fa407b76a9e3d3b03f23690cd0cd1bbe6834 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Wed, 21 May 2025 19:45:02 +0000 Subject: [PATCH 02/26] fix build and tests --- llvm/lib/Target/ARM/ARMISelLowering.h | 4 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 4994 +++++++++++------------ 2 files changed, 2499 insertions(+), 2499 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 79926386cde1e..c71b430b71874 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -676,10 +676,10 @@ class VectorType; Instruction *emitLeadingFence( IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override; + SyncScope::ID SSID = SyncScope::System) const override; Instruction *emitTrailingFence( IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override; + SyncScope::ID SSID = SyncScope::System) const override; unsigned getMaxSupportedInterleaveFactor() const override; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index b73c848c5e680..68658255ad5af 100644 --- 
a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[monotonic_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; 
+; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -158,7 +158,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -203,7 +203,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: 
ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -248,9 +248,9 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 
%r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -338,7 +338,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -383,7 +383,7 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -416,8 +416,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -461,8 +461,8 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -473,9 +473,9 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -506,8 +506,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -518,7 +518,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -551,8 +551,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -563,7 +563,7 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -597,8 +597,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -609,9 +609,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -643,8 +643,8 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -655,9 +655,9 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -689,8 +689,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -701,7 +701,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -735,8 +735,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -747,7 +747,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -781,8 +781,8 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ 
-793,9 +793,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -827,8 +827,8 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -839,9 +839,9 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -873,8 +873,8 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -885,7 +885,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -919,8 +919,8 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -931,7 +931,7 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -965,8 +965,8 @@ define i8 
@monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -977,9 +977,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1011,8 +1011,8 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1023,9 +1023,9 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.u8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1057,8 +1057,8 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1069,7 +1069,7 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1103,8 +1103,8 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, 
-4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1116,7 +1116,7 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1150,8 +1150,8 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1163,9 +1163,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1197,8 +1197,8 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1210,9 +1210,9 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1244,8 +1244,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1257,7 +1257,7 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1291,8 +1291,8 @@ 
define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1304,7 +1304,7 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1338,8 +1338,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1351,9 +1351,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1385,8 +1385,8 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1398,9 +1398,9 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1432,8 +1432,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1445,7 +1445,7 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1479,8 +1479,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1492,7 +1492,7 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1526,8 +1526,8 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1539,9 +1539,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1573,8 +1573,8 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1586,9 +1586,9 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-1620,8 +1620,8 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1633,7 +1633,7 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1667,8 +1667,8 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1679,7 +1679,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; 
SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1713,8 +1713,8 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1725,9 +1725,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1759,8 +1759,8 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1771,9 +1771,9 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; 
SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1805,8 +1805,8 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1817,7 +1817,7 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1851,8 +1851,8 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1863,7 +1863,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -1897,8 +1897,8 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1909,9 +1909,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1943,8 +1943,8 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: 
// %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1989,8 +1989,8 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2001,7 +2001,7 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[acquire_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2035,8 +2035,8 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2047,7 +2047,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2081,8 +2081,8 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2093,9 +2093,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2127,8 +2127,8 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2139,9 +2139,9 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2173,8 +2173,8 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: 
ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2185,9 +2185,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2219,8 +2219,8 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2231,9 +2231,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2265,8 +2265,8 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2277,9 +2277,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2311,8 +2311,8 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2323,9 +2323,9 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2357,8 +2357,8 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2369,9 +2369,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2403,8 +2403,8 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2415,9 +2415,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2449,8 +2449,8 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2461,9 +2461,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-2495,8 +2495,8 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2507,9 +2507,9 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2541,8 +2541,8 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2553,9 +2553,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2587,8 +2587,8 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2599,9 +2599,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2633,8 +2633,8 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2645,9 +2645,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2679,8 +2679,8 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2691,9 +2691,9 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 @@ -2725,8 +2725,8 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2737,9 +2737,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2771,8 +2771,8 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2784,9 +2784,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2818,8 +2818,8 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2831,9 +2831,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2865,8 +2865,8 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2878,9 +2878,9 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2912,8 +2912,8 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2925,9 +2925,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2959,8 +2959,8 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2972,9 +2972,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3006,8 +3006,8 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3019,9 +3019,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 
%r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3053,8 +3053,8 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3066,9 +3066,9 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3100,8 +3100,8 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: 
fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3113,9 +3113,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3147,8 +3147,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3160,9 +3160,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3194,8 +3194,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3207,9 +3207,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3241,8 +3241,8 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3254,9 +3254,9 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[acquire_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3288,8 +3288,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3301,9 +3301,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3335,8 +3335,8 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3348,9 +3348,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3381,8 +3381,8 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3394,9 +3394,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3427,8 +3427,8 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3440,9 +3440,9 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3473,8 +3473,8 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3486,9 +3486,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[release_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3519,8 +3519,8 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3532,9 +3532,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3565,8 +3565,8 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3578,9 +3578,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3611,8 +3611,8 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3624,9 +3624,9 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3657,8 +3657,8 @@ define i8 @release_monotonic_i8_global_gpu(ptr 
addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3670,9 +3670,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3703,8 +3703,8 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3716,9 +3716,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[release_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3749,8 +3749,8 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3762,9 +3762,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3795,8 +3795,8 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3808,9 +3808,9 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3841,8 +3841,8 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3854,9 +3854,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 @@ -3887,8 +3887,8 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3900,9 +3900,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3934,8 +3934,8 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3947,9 +3947,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[release_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3981,8 +3981,8 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3994,9 +3994,9 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4028,8 +4028,8 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4041,9 +4041,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4075,8 +4075,8 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4088,9 +4088,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4122,8 +4122,8 @@ define i8 
@release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4135,9 +4135,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4169,8 +4169,8 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4182,9 +4182,9 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[release_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4216,8 +4216,8 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4229,9 +4229,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4263,8 +4263,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4276,9 +4276,9 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4310,8 +4310,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4323,9 +4323,9 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4357,8 +4357,8 @@ define i8 
@release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4370,9 +4370,9 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4404,8 +4404,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4417,9 +4417,9 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 
%r15, [release_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4451,8 +4451,8 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4464,9 +4464,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4498,8 +4498,8 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4511,9 +4511,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4545,8 +4545,8 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4558,9 +4558,9 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4592,8 +4592,8 @@ define i8 
@release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4605,9 +4605,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4639,8 +4639,8 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4652,9 +4652,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: 
ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4686,8 +4686,8 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4699,9 +4699,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4733,8 +4733,8 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; ; 
SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4746,9 +4746,9 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4780,8 +4780,8 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4793,9 +4793,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4827,8 +4827,8 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) 
%addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4840,9 +4840,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4874,8 +4874,8 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4887,9 +4887,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4921,8 +4921,8 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4934,9 +4934,9 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4968,8 +4968,8 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: 
fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4981,9 +4981,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5015,8 +5015,8 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5028,9 +5028,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5062,8 +5062,8 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5075,9 +5075,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5109,8 +5109,8 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5122,9 +5122,9 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: 
ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5156,8 +5156,8 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5169,9 +5169,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5203,8 +5203,8 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: 
fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5216,9 +5216,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5250,8 +5250,8 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5263,9 +5263,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5297,8 +5297,8 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr 
addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5310,9 +5310,9 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5344,8 +5344,8 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5357,9 +5357,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[acq_rel_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5391,8 +5391,8 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5404,9 +5404,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5438,8 +5438,8 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5451,9 +5451,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5485,8 +5485,8 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5498,9 +5498,9 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 @@ -5532,8 +5532,8 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5545,9 +5545,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5579,8 +5579,8 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5592,9 +5592,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5626,8 +5626,8 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5639,9 +5639,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5673,8 +5673,8 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5686,9 +5686,9 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5720,8 +5720,8 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5733,9 +5733,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5767,8 +5767,8 @@ define i8 
@acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5780,9 +5780,9 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5814,8 +5814,8 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5827,9 +5827,9 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5861,8 +5861,8 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5874,9 +5874,9 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5908,8 +5908,8 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5921,9 +5921,9 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5955,8 +5955,8 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5968,9 +5968,9 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6002,8 +6002,8 
@@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6015,9 +6015,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6049,8 +6049,8 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6062,9 +6062,9 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6096,8 +6096,8 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6109,9 +6109,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6143,8 +6143,8 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6156,9 +6156,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6190,8 +6190,8 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6203,9 +6203,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6237,8 +6237,8 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr 
%addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6250,9 +6250,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6284,8 +6284,8 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6297,9 +6297,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 
%r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6331,8 +6331,8 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6344,9 +6344,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6378,8 +6378,8 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6391,9 +6391,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6425,8 +6425,8 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6438,9 +6438,9 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6472,8 +6472,8 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6485,9 +6485,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6519,8 +6519,8 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6532,9 +6532,9 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6566,8 +6566,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6579,9 +6579,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6613,8 +6613,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: 
fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6626,9 +6626,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6660,8 +6660,8 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6673,9 +6673,9 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6707,8 +6707,8 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 
%new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6720,9 +6720,9 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6754,8 +6754,8 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6767,9 +6767,9 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[seq_cst_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6801,8 +6801,8 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6814,9 +6814,9 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6848,8 +6848,8 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: 
fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6861,9 +6861,9 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6895,8 +6895,8 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6908,9 +6908,9 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6942,8 +6942,8 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6955,9 +6955,9 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6989,8 +6989,8 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7002,9 +7002,9 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: 
ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7036,8 +7036,8 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7049,9 +7049,9 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7083,8 +7083,8 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; ; 
SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7096,9 +7096,9 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7130,8 +7130,8 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7143,9 +7143,9 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7177,8 +7177,8 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr 
addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7190,9 +7190,9 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7224,8 +7224,8 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7237,9 +7237,9 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7271,8 +7271,8 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7284,9 +7284,9 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7318,8 +7318,8 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7331,9 +7331,9 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7365,8 +7365,8 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7378,9 +7378,9 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7412,8 +7412,8 @@ define i8 
@seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7425,9 +7425,9 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7459,8 +7459,8 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7472,9 +7472,9 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM90-NEXT: 
ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7506,8 +7506,8 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7519,9 +7519,9 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7553,8 +7553,8 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; ; 
SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7566,9 +7566,9 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7600,8 +7600,8 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7613,9 +7613,9 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7647,8 +7647,8 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) 
%addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7660,9 +7660,9 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7694,8 +7694,8 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7707,9 +7707,9 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[seq_cst_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7741,8 +7741,8 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7754,9 +7754,9 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7788,8 +7788,8 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: 
fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7801,9 +7801,9 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7835,8 +7835,8 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7848,9 +7848,9 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7882,8 +7882,8 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7895,9 +7895,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7929,8 +7929,8 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7942,9 +7942,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7976,8 +7976,8 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7989,9 +7989,9 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8023,8 +8023,8 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8036,9 +8036,9 @@ define i8 
@seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8070,8 +8070,8 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8083,9 +8083,9 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8117,8 +8117,8 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[seq_cst_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8130,9 +8130,9 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8164,8 +8164,8 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8177,9 +8177,9 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: 
ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8211,8 +8211,8 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8224,9 +8224,9 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8258,8 +8258,8 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8271,9 +8271,9 @@ define i8 
@seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8305,8 +8305,8 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8318,9 +8318,9 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8352,8 +8352,8 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8365,9 +8365,9 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8399,10 +8399,10 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8412,7 +8412,7 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8443,10 +8443,10 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8456,7 +8456,7 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8487,10 +8487,10 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8500,7 +8500,7 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8531,10 +8531,10 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8544,7 +8544,7 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8575,10 +8575,10 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr 
addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8588,7 +8588,7 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8619,10 +8619,10 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8632,7 +8632,7 @@ define i16 
@monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8663,10 +8663,10 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8676,7 +8676,7 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8707,10 +8707,10 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: 
ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8720,7 +8720,7 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8751,10 +8751,10 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8764,7 +8764,7 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, 
%r15, %r2; ; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8795,10 +8795,10 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8808,7 +8808,7 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8839,10 +8839,10 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[monotonic_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8852,7 +8852,7 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8883,10 +8883,10 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8896,7 +8896,7 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8927,10 +8927,10 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8940,7 +8940,7 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8972,10 +8972,10 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8985,7 +8985,7 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9017,10 +9017,10 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9030,7 +9030,7 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9062,10 +9062,10 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9075,7 +9075,7 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9107,10 +9107,10 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9120,7 +9120,7 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9152,10 +9152,10 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9165,7 +9165,7 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9197,10 +9197,10 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9210,7 +9210,7 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) 
%addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9242,10 +9242,10 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9255,7 +9255,7 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9287,10 +9287,10 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [monotonic_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9300,7 +9300,7 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9332,10 +9332,10 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9345,7 +9345,7 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 
@@ -9377,10 +9377,10 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9390,7 +9390,7 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9422,10 +9422,10 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: 
shl.b32 %r1, %r11, 3; @@ -9435,7 +9435,7 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9467,10 +9467,10 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9481,7 +9481,7 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9513,10 +9513,10 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: 
ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9527,7 +9527,7 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9559,10 +9559,10 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9573,7 +9573,7 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: 
$L__BB206_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9605,10 +9605,10 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9619,7 +9619,7 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9651,10 +9651,10 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9665,7 +9665,7 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9697,10 +9697,10 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9711,7 +9711,7 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9743,10 +9743,10 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9757,7 +9757,7 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9789,10 +9789,10 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9803,7 +9803,7 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9835,10 +9835,10 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9849,7 +9849,7 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9881,10 +9881,10 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[monotonic_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9895,7 +9895,7 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9927,10 +9927,10 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9941,7 +9941,7 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9973,10 +9973,10 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9987,7 +9987,7 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10019,10 +10019,10 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10032,7 +10032,7 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10064,10 +10064,10 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10077,7 +10077,7 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10109,10 +10109,10 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10122,7 +10122,7 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10154,10 +10154,10 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10167,7 +10167,7 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10199,10 +10199,10 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 
%cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10212,7 +10212,7 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10244,10 +10244,10 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10257,7 +10257,7 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 
%cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10289,10 +10289,10 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10302,7 +10302,7 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10334,10 +10334,10 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10347,7 +10347,7 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10379,10 +10379,10 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10392,7 +10392,7 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop ; SM90-NEXT: // 
=>This Inner Loop Header: Depth=1 @@ -10424,10 +10424,10 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10437,7 +10437,7 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10469,10 +10469,10 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: 
and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10482,7 +10482,7 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10514,10 +10514,10 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10527,7 +10527,7 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10559,10 +10559,10 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10572,7 +10572,7 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10604,10 +10604,10 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10617,7 +10617,7 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; 
; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10649,10 +10649,10 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10662,7 +10662,7 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10694,10 +10694,10 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10707,7 +10707,7 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10739,10 +10739,10 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10752,7 +10752,7 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10784,10 +10784,10 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 
%rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10797,7 +10797,7 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10829,10 +10829,10 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10842,7 +10842,7 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10874,10 +10874,10 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10887,7 +10887,7 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10919,10 +10919,10 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[acquire_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10932,7 +10932,7 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10964,10 +10964,10 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10977,7 +10977,7 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11009,10 +11009,10 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 
%rs1, [acquire_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11022,7 +11022,7 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11054,10 +11054,10 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11067,7 +11067,7 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11099,10 +11099,10 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11113,7 +11113,7 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11145,10 +11145,10 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; 
+; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11159,7 +11159,7 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11191,10 +11191,10 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11205,7 +11205,7 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11237,10 +11237,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11251,7 +11251,7 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11283,10 +11283,10 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11297,7 +11297,7 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11329,10 +11329,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11343,7 +11343,7 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11375,10 +11375,10 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, 
[acquire_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11389,7 +11389,7 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11421,10 +11421,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11435,7 +11435,7 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11467,10 +11467,10 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg 
.b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11481,7 +11481,7 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11513,10 +11513,10 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11527,7 +11527,7 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 
%r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11559,10 +11559,10 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11573,7 +11573,7 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11605,10 +11605,10 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: 
fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11619,7 +11619,7 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11651,10 +11651,10 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11665,7 +11665,7 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11696,10 +11696,10 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, 
i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11710,7 +11710,7 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11741,10 +11741,10 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11755,7 +11755,7 @@ define i16 
@release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11786,10 +11786,10 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11800,7 +11800,7 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11831,10 +11831,10 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; 
+; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11845,7 +11845,7 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11876,10 +11876,10 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11890,7 +11890,7 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 @@ -11921,10 +11921,10 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11935,7 +11935,7 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11966,10 +11966,10 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 
%r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11980,7 +11980,7 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12011,10 +12011,10 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12025,7 +12025,7 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12056,10 +12056,10 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12070,7 +12070,7 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12101,10 +12101,10 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12115,7 +12115,7 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; 
SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12146,10 +12146,10 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12160,7 +12160,7 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12191,10 +12191,10 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: 
ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12205,7 +12205,7 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12237,10 +12237,10 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12251,7 +12251,7 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12283,10 +12283,10 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[release_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12297,7 +12297,7 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12329,10 +12329,10 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12343,7 +12343,7 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12375,10 +12375,10 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12389,7 +12389,7 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12421,10 +12421,10 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; 
SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12435,7 +12435,7 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12467,10 +12467,10 @@ define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12481,7 +12481,7 @@ define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12513,10 +12513,10 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12527,7 +12527,7 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12559,10 +12559,10 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12573,7 +12573,7 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, 
%r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12605,10 +12605,10 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12619,7 +12619,7 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12651,10 +12651,10 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: 
fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12665,7 +12665,7 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12697,10 +12697,10 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12711,7 +12711,7 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12743,10 +12743,10 @@ define i16 
@release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12757,7 +12757,7 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12789,10 +12789,10 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12803,7 +12803,7 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, 
i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12835,10 +12835,10 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12849,7 +12849,7 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12881,10 +12881,10 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12895,7 +12895,7 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12927,10 +12927,10 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12941,7 +12941,7 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12973,10 +12973,10 @@ define i16 
@release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12987,7 +12987,7 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13019,10 +13019,10 @@ define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13033,7 +13033,7 @@ define i16 
@release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13065,10 +13065,10 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13079,7 +13079,7 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13111,10 +13111,10 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[release_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13125,7 +13125,7 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13157,10 +13157,10 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13171,7 +13171,7 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13203,10 +13203,10 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13217,7 +13217,7 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13249,10 +13249,10 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; 
; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13263,7 +13263,7 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13295,10 +13295,10 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13309,7 +13309,7 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13341,10 +13341,10 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13355,7 +13355,7 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13387,10 +13387,10 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13401,7 +13401,7 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: 
ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13433,10 +13433,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13447,7 +13447,7 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13479,10 +13479,10 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[acq_rel_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13493,7 +13493,7 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13525,10 +13525,10 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13539,7 +13539,7 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13571,10 +13571,10 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13585,7 +13585,7 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13617,10 +13617,10 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13631,7 +13631,7 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, 
%r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13663,10 +13663,10 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13677,7 +13677,7 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13709,10 +13709,10 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: 
fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13723,7 +13723,7 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13755,10 +13755,10 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13769,7 +13769,7 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13801,10 +13801,10 @@ define 
i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13815,7 +13815,7 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13847,10 +13847,10 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13861,7 +13861,7 @@ define i16 
@acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13893,10 +13893,10 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13907,7 +13907,7 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13939,10 +13939,10 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13953,7 +13953,7 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13985,10 +13985,10 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13999,7 +13999,7 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-14031,10 +14031,10 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14045,7 +14045,7 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14077,10 +14077,10 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14091,7 +14091,7 @@ define 
i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14123,10 +14123,10 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14137,7 +14137,7 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14169,10 +14169,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[acq_rel_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14183,7 +14183,7 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14215,10 +14215,10 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14229,7 +14229,7 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop 
; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14261,10 +14261,10 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14275,7 +14275,7 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14307,10 +14307,10 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 
%r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14321,7 +14321,7 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14353,10 +14353,10 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14367,7 +14367,7 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14399,10 +14399,10 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14413,7 +14413,7 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14445,10 +14445,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14459,7 +14459,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: 
$L__BB313_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14491,10 +14491,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14505,7 +14505,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14537,10 +14537,10 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14551,7 +14551,7 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14583,10 +14583,10 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14597,7 +14597,7 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14629,10 +14629,10 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14643,7 +14643,7 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14675,10 +14675,10 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14689,7 +14689,7 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, 
[%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14721,10 +14721,10 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14735,7 +14735,7 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14767,10 +14767,10 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14781,7 +14781,7 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14813,10 +14813,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14827,7 +14827,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14859,10 +14859,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; 
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14873,7 +14873,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14905,10 +14905,10 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14919,7 +14919,7 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; 
SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14951,10 +14951,10 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14965,7 +14965,7 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14997,10 +14997,10 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[seq_cst_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15011,7 +15011,7 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15043,10 +15043,10 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15057,7 +15057,7 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15089,10 +15089,10 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15103,7 +15103,7 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15135,10 +15135,10 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15149,7 +15149,7 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; 
-; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15181,10 +15181,10 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15195,7 +15195,7 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15227,10 +15227,10 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, 
[seq_cst_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15241,7 +15241,7 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15273,10 +15273,10 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15287,7 +15287,7 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15319,10 +15319,10 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15333,7 +15333,7 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15365,10 +15365,10 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15379,7 +15379,7 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15411,10 +15411,10 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15425,7 +15425,7 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15457,10 +15457,10 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15471,7 +15471,7 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15503,10 +15503,10 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15517,7 +15517,7 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15549,10 +15549,10 @@ define i16 
@seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15563,7 +15563,7 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15595,10 +15595,10 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15609,7 +15609,7 @@ define i16 
@seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15641,10 +15641,10 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15655,7 +15655,7 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15687,10 +15687,10 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [seq_cst_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15701,7 +15701,7 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15733,10 +15733,10 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15747,7 +15747,7 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15779,10 +15779,10 @@ 
define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15793,7 +15793,7 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15825,10 +15825,10 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15839,7 +15839,7 @@ define i16 
@seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15871,10 +15871,10 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15885,7 +15885,7 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15917,10 +15917,10 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15931,7 +15931,7 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15963,10 +15963,10 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15977,7 +15977,7 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB346_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16009,10 +16009,10 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16023,7 +16023,7 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16055,10 +16055,10 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; 
SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16069,7 +16069,7 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16101,10 +16101,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16115,7 +16115,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16147,10 +16147,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 
%rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16161,7 +16161,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16193,10 +16193,10 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16207,7 +16207,7 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16239,10 +16239,10 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16253,7 +16253,7 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16285,10 +16285,10 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, 
%r10, 3; @@ -16299,7 +16299,7 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16331,10 +16331,10 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16345,7 +16345,7 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16377,10 +16377,10 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; 
+; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16391,7 +16391,7 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16423,10 +16423,10 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16437,7 +16437,7 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB356_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16469,10 +16469,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16483,7 +16483,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16515,10 +16515,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16529,7 +16529,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16561,10 +16561,10 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16575,7 +16575,7 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16605,9 +16605,9 @@ define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[monotonic_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16622,9 +16622,9 @@ define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16639,9 +16639,9 @@ define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, 
%r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16656,9 +16656,9 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16673,9 +16673,9 @@ define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16690,9 +16690,9 @@ define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, 
[monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16707,9 +16707,9 @@ define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16724,9 +16724,9 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ 
-16741,9 +16741,9 @@ define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16758,9 +16758,9 @@ define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16775,9 +16775,9 @@ define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 
%rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16792,9 +16792,9 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16809,9 +16809,9 @@ define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16826,9 +16826,9 @@ define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; 
SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16843,9 +16843,9 @@ define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16860,9 +16860,9 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, 
[monotonic_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16877,9 +16877,9 @@ define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16894,9 +16894,9 @@ define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16911,9 +16911,9 @@ define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[monotonic_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16928,9 +16928,9 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16945,9 +16945,9 @@ define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: 
atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16962,9 +16962,9 @@ define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16979,9 +16979,9 @@ define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16996,9 +16996,9 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[monotonic_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17013,10 +17013,10 @@ define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17031,10 +17031,10 @@ define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; 
SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17049,10 +17049,10 @@ define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17067,10 +17067,10 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17085,10 +17085,10 @@ define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17103,10 +17103,10 @@ define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17121,10 +17121,10 @@ define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; ; 
SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17139,10 +17139,10 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17157,10 +17157,10 @@ define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17175,10 +17175,10 @@ define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 
%rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17193,10 +17193,10 @@ define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17211,10 +17211,10 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; 
; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17229,9 +17229,9 @@ define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17246,9 +17246,9 @@ define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17263,9 +17263,9 @@ define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; -; 
SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17280,9 +17280,9 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17297,9 +17297,9 @@ define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17314,9 +17314,9 
@@ define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17331,9 +17331,9 @@ define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17348,9 +17348,9 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17365,9 +17365,9 @@ define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17382,9 +17382,9 @@ define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17399,9 +17399,9 @@ define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17416,9 +17416,9 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17433,9 +17433,9 @@ define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[acquire_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17450,9 +17450,9 @@ define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17467,9 +17467,9 @@ define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17484,9 +17484,9 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[acquire_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17501,9 +17501,9 @@ define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17518,9 +17518,9 @@ define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17535,9 +17535,9 @@ 
define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17552,9 +17552,9 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17569,9 +17569,9 @@ define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: 
ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17586,9 +17586,9 @@ define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17603,9 +17603,9 @@ define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17620,9 +17620,9 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17637,10 +17637,10 @@ define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17655,10 +17655,10 @@ define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: 
atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17673,10 +17673,10 @@ define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17691,10 +17691,10 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17709,10 +17709,10 @@ define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17727,10 +17727,10 @@ define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17745,10 +17745,10 @@ define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: 
atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17763,10 +17763,10 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17781,10 +17781,10 @@ define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17799,10 +17799,10 @@ define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17817,10 +17817,10 @@ define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17835,10 +17835,10 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17853,9 +17853,9 @@ define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17870,9 +17870,9 @@ define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17887,9 +17887,9 @@ define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: 
ld.param.u32 %r2, [release_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17904,9 +17904,9 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17921,9 +17921,9 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17938,9 +17938,9 @@ define 
i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17955,9 +17955,9 @@ define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17972,9 +17972,9 @@ define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[release_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17989,9 +17989,9 @@ define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18006,9 +18006,9 @@ define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18023,9 +18023,9 @@ define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18040,9 +18040,9 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18057,9 +18057,9 @@ define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[release_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18074,9 +18074,9 @@ define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18091,9 +18091,9 @@ define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18108,9 +18108,9 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[release_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18125,9 +18125,9 @@ define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18142,9 +18142,9 @@ define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18159,9 +18159,9 @@ 
define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18176,9 +18176,9 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18193,9 +18193,9 @@ define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: 
ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18210,9 +18210,9 @@ define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18227,9 +18227,9 @@ define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18244,9 +18244,9 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18261,10 +18261,10 @@ define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18279,10 +18279,10 @@ define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: 
atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18297,10 +18297,10 @@ define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18315,10 +18315,10 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18333,10 +18333,10 @@ define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18351,10 +18351,10 @@ define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18369,10 +18369,10 @@ define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: 
atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18387,10 +18387,10 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18405,10 +18405,10 @@ define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18423,10 +18423,10 @@ define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18441,10 +18441,10 @@ define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18459,10 +18459,10 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18477,9 +18477,9 @@ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18494,9 +18494,9 @@ define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18511,9 +18511,9 @@ define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: 
ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18528,9 +18528,9 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18545,9 +18545,9 @@ define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18562,9 +18562,9 @@ define 
i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18579,9 +18579,9 @@ define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18596,9 +18596,9 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18613,9 +18613,9 @@ define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18630,9 +18630,9 @@ define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18647,9 +18647,9 @@ define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18664,9 +18664,9 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18681,9 +18681,9 @@ define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18698,9 +18698,9 @@ define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18715,9 +18715,9 @@ define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18732,9 +18732,9 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[acq_rel_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18749,9 +18749,9 @@ define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18766,9 +18766,9 @@ define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18783,9 +18783,9 @@ 
define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18800,9 +18800,9 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18817,9 +18817,9 @@ define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: 
ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18834,9 +18834,9 @@ define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18851,9 +18851,9 @@ define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18868,9 +18868,9 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18885,10 +18885,10 @@ define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18903,10 +18903,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: 
atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18921,10 +18921,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18939,10 +18939,10 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18957,10 +18957,10 @@ define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18975,10 +18975,10 @@ define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18993,10 +18993,10 @@ define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: 
atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19011,10 +19011,10 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19029,10 +19029,10 @@ define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19047,10 +19047,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19065,10 +19065,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19083,10 +19083,10 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19101,10 +19101,10 @@ define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19119,10 +19119,10 @@ define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19137,10 +19137,10 @@ define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19155,10 +19155,10 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19173,10 +19173,10 @@ define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: 
atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19191,10 +19191,10 @@ define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19209,10 +19209,10 @@ define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19227,10 +19227,10 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; 
SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19245,10 +19245,10 @@ define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19263,10 +19263,10 @@ define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; ; 
SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19281,10 +19281,10 @@ define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19299,10 +19299,10 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19317,10 +19317,10 @@ define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; 
SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19335,10 +19335,10 @@ define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19353,10 +19353,10 @@ define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: 
atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19371,10 +19371,10 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19389,10 +19389,10 @@ define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19407,10 +19407,10 @@ define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; 
; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19425,10 +19425,10 @@ define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19443,10 +19443,10 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19461,10 +19461,10 @@ define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19479,10 +19479,10 @@ define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19497,10 +19497,10 @@ define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: 
ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19515,10 +19515,10 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19533,10 +19533,10 @@ define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ 
-19551,10 +19551,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19569,10 +19569,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19587,10 +19587,10 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, 
[seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19605,10 +19605,10 @@ define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19623,10 +19623,10 @@ define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19641,10 +19641,10 @@ define i32 
@seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19659,10 +19659,10 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19677,10 +19677,10 @@ define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: 
ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19695,10 +19695,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19713,10 +19713,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19731,10 +19731,10 @@ define i32 
@seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19748,9 +19748,9 @@ define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19764,9 +19764,9 @@ define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19780,9 +19780,9 @@ define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19796,9 +19796,9 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19812,9 +19812,9 @@ define i64 
@monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19828,9 +19828,9 @@ define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19844,9 +19844,9 @@ define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19860,9 +19860,9 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19876,9 +19876,9 @@ define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19892,9 +19892,9 @@ define i64 @monotonic_monotonic_i64_shared_cta(ptr 
addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19908,9 +19908,9 @@ define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19924,9 +19924,9 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19940,9 +19940,9 @@ define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19956,9 +19956,9 @@ define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19972,9 +19972,9 @@ define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: 
.reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19988,9 +19988,9 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20004,9 +20004,9 @@ define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20020,9 +20020,9 @@ define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20036,9 +20036,9 @@ define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20052,9 +20052,9 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20068,9 +20068,9 @@ define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20084,9 +20084,9 @@ define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20100,9 +20100,9 @@ define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20116,9 +20116,9 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20132,10 +20132,10 @@ define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20149,10 +20149,10 @@ define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20166,10 +20166,10 @@ define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20183,10 +20183,10 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20200,10 +20200,10 @@ define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20217,10 +20217,10 @@ define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 
%rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20234,10 +20234,10 @@ define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20251,10 +20251,10 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20268,10 +20268,10 @@ define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20285,10 +20285,10 @@ define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20302,10 +20302,10 @@ define i64 
@monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20319,10 +20319,10 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20336,9 +20336,9 @@ define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[acquire_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20352,9 +20352,9 @@ define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20368,9 +20368,9 @@ define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20384,9 +20384,9 
@@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20400,9 +20400,9 @@ define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20416,9 +20416,9 @@ define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20432,9 +20432,9 @@ define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20448,9 +20448,9 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20464,9 +20464,9 @@ define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, 
i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20480,9 +20480,9 @@ define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20496,9 +20496,9 @@ define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20512,9 +20512,9 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20528,9 +20528,9 @@ define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20544,9 +20544,9 @@ define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[acquire_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20560,9 +20560,9 @@ define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20576,9 +20576,9 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20592,9 +20592,9 @@ define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20608,9 +20608,9 @@ define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20624,9 +20624,9 @@ define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 
%rd3, [acquire_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20640,9 +20640,9 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20656,9 +20656,9 @@ define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20672,9 +20672,9 @@ define i64 
@acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20688,9 +20688,9 @@ define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20704,9 +20704,9 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: 
ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20720,10 +20720,10 @@ define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20737,10 +20737,10 @@ define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20754,10 +20754,10 @@ define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20771,10 +20771,10 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20788,10 +20788,10 @@ define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20805,10 +20805,10 @@ define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20822,10 +20822,10 @@ define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20839,10 +20839,10 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 
%rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20856,10 +20856,10 @@ define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20873,10 +20873,10 @@ define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20890,10 +20890,10 @@ define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20907,10 +20907,10 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20924,9 +20924,9 @@ define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20940,9 +20940,9 @@ define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20956,9 +20956,9 @@ define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: 
ld.param.b64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20972,9 +20972,9 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20988,9 +20988,9 @@ define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21004,9 +21004,9 @@ define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; 
SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21020,9 +21020,9 @@ define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21036,9 +21036,9 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.global.cas.b64 
%rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21052,9 +21052,9 @@ define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21068,9 +21068,9 @@ define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21084,9 +21084,9 @@ define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 
%rd3, [release_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21100,9 +21100,9 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21116,9 +21116,9 @@ define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21132,9 +21132,9 @@ define i64 
@release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21148,9 +21148,9 @@ define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21164,9 +21164,9 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: 
ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21180,9 +21180,9 @@ define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21196,9 +21196,9 @@ define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21212,9 +21212,9 @@ define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[release_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21228,9 +21228,9 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21244,9 +21244,9 @@ define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: 
atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21260,9 +21260,9 @@ define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21276,9 +21276,9 @@ define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21292,9 +21292,9 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; 
SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21308,10 +21308,10 @@ define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21325,10 +21325,10 @@ define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: 
ret; @@ -21342,10 +21342,10 @@ define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21359,10 +21359,10 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21376,10 +21376,10 @@ define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21393,10 +21393,10 @@ define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21410,10 +21410,10 @@ define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM90-NEXT: ret; @@ -21427,10 +21427,10 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21444,10 +21444,10 @@ define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21461,10 +21461,10 @@ define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21478,10 +21478,10 @@ define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21495,10 +21495,10 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM90-NEXT: ret; @@ -21512,9 +21512,9 @@ define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21528,9 +21528,9 @@ define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21544,9 +21544,9 @@ define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 
%rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21560,9 +21560,9 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21576,9 +21576,9 @@ define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21592,9 +21592,9 @@ define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i 
; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21608,9 +21608,9 @@ define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21624,9 +21624,9 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21640,9 +21640,9 @@ define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21656,9 +21656,9 @@ define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21672,9 +21672,9 @@ define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21688,9 +21688,9 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21704,9 +21704,9 @@ define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, 
[acq_rel_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21720,9 +21720,9 @@ define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21736,9 +21736,9 @@ define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21752,9 +21752,9 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21768,9 +21768,9 @@ define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21784,9 +21784,9 @@ define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ 
-21800,9 +21800,9 @@ define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21816,9 +21816,9 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21832,9 +21832,9 @@ define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acq_rel_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21848,9 +21848,9 @@ define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21864,9 +21864,9 @@ define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21880,9 +21880,9 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg 
.b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21896,10 +21896,10 @@ define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21913,10 +21913,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; 
+; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21930,10 +21930,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21947,10 +21947,10 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21964,10 +21964,10 @@ define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21981,10 +21981,10 @@ define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21998,10 +21998,10 @@ define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; +; 
SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22015,10 +22015,10 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22032,10 +22032,10 @@ define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22049,10 +22049,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22066,10 +22066,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22083,10 +22083,10 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: 
ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22100,10 +22100,10 @@ define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22117,10 +22117,10 @@ define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22134,10 +22134,10 @@ define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22151,10 +22151,10 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22168,10 +22168,10 @@ define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22185,10 +22185,10 @@ define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22202,10 +22202,10 @@ define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22219,10 +22219,10 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i 
; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22236,10 +22236,10 @@ define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22253,10 +22253,10 @@ define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[seq_cst_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22270,10 +22270,10 @@ define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22287,10 +22287,10 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22304,10 +22304,10 @@ 
define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22321,10 +22321,10 @@ define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22338,10 +22338,10 @@ define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; -; 
SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22355,10 +22355,10 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22372,10 +22372,10 @@ define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22389,10 +22389,10 @@ define i64 
@seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22406,10 +22406,10 @@ define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22423,10 +22423,10 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; 
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22440,10 +22440,10 @@ define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22457,10 +22457,10 @@ define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22474,10 +22474,10 @@ define i64 
@seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22491,10 +22491,10 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22508,10 +22508,10 @@ define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; 
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22525,10 +22525,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22542,10 +22542,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22559,10 +22559,10 @@ define i64 
@seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22576,10 +22576,10 @@ define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22593,10 +22593,10 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22610,10 +22610,10 @@ define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22627,10 +22627,10 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22644,10 +22644,10 @@ define i64 
@seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22661,10 +22661,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22678,10 +22678,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: 
ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22695,10 +22695,10 @@ define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; From 5c7d41777fcf4217642a40e55a992fcb2997c4bc Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Wed, 21 May 2025 19:53:49 +0000 Subject: [PATCH 03/26] clang-format --- llvm/lib/Target/ARM/ARMISelLowering.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index c71b430b71874..f2e4207ff13ee 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -674,12 +674,14 @@ class VectorType; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; - Instruction *emitLeadingFence( - IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; - Instruction *emitTrailingFence( - IRBuilderBase &Builder, Instruction 
*Inst, AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; unsigned getMaxSupportedInterleaveFactor() const override; From fbca61b9c762856167da7800267d3b64d91704e8 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 30 May 2025 00:32:30 +0000 Subject: [PATCH 04/26] Update APIs in VE target --- llvm/lib/Target/VE/VEISelLowering.cpp | 6 ++++-- llvm/lib/Target/VE/VEISelLowering.h | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 9e8f400256198..51575264f899f 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1047,7 +1047,8 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -1068,7 +1069,8 @@ Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h index 04274b14baa1f..7390f84c6f8c2 100644 --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -203,10 
+203,14 @@ class VETargetLowering : public TargetLowering { // VE uses release consistency, so need fence for each atomics. return true; } - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; ISD::NodeType getExtendForAtomicOps() const override { From 13018b71397fcdf3dcabe155d8d6c21ec8652e47 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 30 May 2025 20:26:07 +0000 Subject: [PATCH 05/26] Fence scope is the same as cmpxchg scope- dont add API --- llvm/include/llvm/CodeGen/TargetLowering.h | 16 +++++++--------- llvm/lib/CodeGen/AtomicExpandPass.cpp | 15 ++++----------- llvm/lib/CodeGen/TargetLoweringBase.cpp | 10 ++++------ llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 ++---- llvm/lib/Target/ARM/ARMISelLowering.h | 12 ++++-------- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 15 +++++++-------- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 6 ++---- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 6 ++---- llvm/lib/Target/PowerPC/PPCISelLowering.h | 13 +++++-------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 ++---- llvm/lib/Target/RISCV/RISCVISelLowering.h | 12 ++++-------- llvm/lib/Target/VE/VEISelLowering.cpp | 6 ++---- llvm/lib/Target/VE/VEISelLowering.h | 12 ++++-------- 13 files changed, 49 insertions(+), 86 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index d11e2ca22b189..9c3cede359c15 100644 --- 
a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2325,15 +2325,13 @@ class LLVM_ABI TargetLoweringBase { /// standard ABI uses a fence before a seq_cst load instead of after a /// seq_cst store). /// @{ - virtual Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const; - - virtual Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const; + virtual Instruction *emitLeadingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const; + + virtual Instruction *emitTrailingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const; /// @} // Emits code that executes when the comparison result in the ll/sc diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index bc400b28d26af..3f3d5dc90711f 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -314,7 +314,6 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; - SyncScope::ID SSID = SyncScope::System; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); @@ -337,18 +336,13 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { // expandAtomicCmpXchg in that case. FenceOrdering = CASI->getMergedOrdering(); auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI); - SSID = CASI->getSyncScopeID(); CASI->setSuccessOrdering(CASOrdering); CASI->setFailureOrdering(CASOrdering); - // If CAS ordering is monotonic, then the operation will - // take default scope. 
Otherwise, it will retain its scope - if (CASOrdering != AtomicOrdering::Monotonic) - CASI->setSyncScopeID(SSID); } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID); + MadeChange |= bracketInstWithFences(I, FenceOrdering); } } else if (I->hasAtomicStore() && TLI->shouldInsertTrailingFenceForAtomicStore(I)) { @@ -449,13 +443,12 @@ PreservedAnalyses AtomicExpandPass::run(Function &F, } bool AtomicExpandImpl::bracketInstWithFences(Instruction *I, - AtomicOrdering Order, - SyncScope::ID SSID) { + AtomicOrdering Order) { ReplacementIRBuilder Builder(I, *DL); - auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID); + auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); - auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); // We have a guard here because not every atomic operation generates a // trailing fence. if (TrailingFence) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 6c4a480b5ca87..0a077b7b61437 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2383,20 +2383,18 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) - return Builder.CreateFence(Ord, SSID); + return Builder.CreateFence(Ord); else return nullptr; } Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (isAcquireOrStronger(Ord)) - return Builder.CreateFence(Ord, SSID); + return Builder.CreateFence(Ord); else return nullptr; } diff --git 
a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1a409c3165f49..05ca11cfac5cb 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21190,8 +21190,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -21216,8 +21215,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index f2e4207ff13ee..604910e04d4cc 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -674,14 +674,10 @@ class VectorType; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; - Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; - Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; unsigned getMaxSupportedInterleaveFactor() const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp 
b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d0df26087d2e5..f038483aa4298 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6266,13 +6266,13 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (!isa(Inst)) return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); // Specialize for cmpxchg // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated + SyncScope::ID SSID = cast(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) return Ord == AtomicOrdering::SequentiallyConsistent ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, @@ -6284,16 +6284,15 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { // Specialize for cmpxchg if (!isa(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); - auto CASWidth = - cast( - dyn_cast(Inst)->getCompareOperand()->getType()) - ->getBitWidth(); + auto *CI = cast(Inst); + auto CASWidth = cast(CI->getCompareOperand()->getType()) + ->getBitWidth(); + SyncScope::ID SSID= CI->getSyncScopeID(); // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index d60d04e65c460..6a944116edb88 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -266,12 +266,10 @@ class NVPTXTargetLowering : public TargetLowering { Instruction * emitLeadingFence(IRBuilderBase &Builder, 
Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + AtomicOrdering Ord) const override; Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + AtomicOrdering Ord) const override; unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 7e2bd684a3e06..b96505816dee8 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12820,8 +12820,7 @@ Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder, // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -12831,8 +12830,7 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 3e99f8b8d21b6..4c88bd372b106 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -932,14 +932,11 @@ namespace llvm { Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - Instruction * 
- emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; - Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; bool shouldInlineQuadwordAtomics() const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a08b4aac24e06..35fbac04b3405 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23306,8 +23306,7 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint( Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); @@ -23323,8 +23322,7 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 05ea2e5759f80..a1b283e35074a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -232,14 +232,10 @@ class RISCVTargetLowering : public TargetLowering { // than this hook due to limitations in the interface here. 
bool shouldInsertFencesForAtomic(const Instruction *I) const override; - Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; - Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 51575264f899f..9e8f400256198 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1047,8 +1047,7 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -1069,8 +1068,7 @@ Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h index 7390f84c6f8c2..04274b14baa1f 100644 --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -203,14 +203,10 @@ class VETargetLowering : public TargetLowering { // VE uses release consistency, so need 
fence for each atomics. return true; } - Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; - Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; ISD::NodeType getExtendForAtomicOps() const override { From af29d0785c911a4e0d9d28ccca9a5137fdd71606 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 30 May 2025 20:46:56 +0000 Subject: [PATCH 06/26] test default syncscope --- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 9424 +++++++++++++++---- llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 9434 +++++++++++++++---- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 10878 ++++++++++++++++------ llvm/test/CodeGen/NVPTX/cmpxchg.py | 37 +- 4 files changed, 23411 insertions(+), 6362 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 0281212659da0..11f79acb6060e 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 
@@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: 
ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -68,7 +68,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -113,7 +113,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 
@monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -178,12 +178,12 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -203,7 +203,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -223,12 +223,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -248,7 +248,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -268,12 +268,12 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; 
SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -313,12 +313,12 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; 
SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -358,12 +358,12 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -383,7 +383,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 
%r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -403,12 +403,12 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -416,8 +416,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -446,15 +446,14 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -462,8 +461,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -474,9 +473,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -492,15 +491,14 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -508,8 +506,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -520,9 +518,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, 
%r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -538,15 +536,14 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_sys( +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -554,8 +551,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -566,9 +563,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; 
SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -587,12 +584,12 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_cta( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -600,8 +597,8 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -612,9 +609,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 @@ -630,15 +627,15 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_gpu( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -646,8 +643,8 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -658,9 +655,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; 
SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -676,15 +673,15 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_sys( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -692,8 +689,8 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -704,9 +701,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; 
SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -722,15 +719,15 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_cta( +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -738,8 +735,8 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -750,9 +747,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, 
%r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -768,15 +765,15 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_gpu( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -784,8 +781,8 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -796,9 +793,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 
%r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -814,15 +811,15 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -830,9 +827,8 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -843,9 +839,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -861,15 +857,15 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -877,9 +873,8 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -890,9 +885,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -908,15 +903,15 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -924,9 +919,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -937,9 +931,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, 
i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -955,15 +949,15 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_sys( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -971,9 +965,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ 
-984,9 +977,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1005,12 +998,12 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_cta( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1018,9 +1011,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1031,9 +1023,9 @@ define i8 
@monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1052,12 +1044,12 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1065,9 +1057,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1078,9 +1069,9 @@ define i8 
@monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1099,12 +1090,12 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1112,8 +1103,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1125,9 +1116,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; 
SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1146,12 +1137,12 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1159,9 +1150,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1172,9 +1163,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1190,15 +1181,15 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1206,9 +1197,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1219,9 +1210,9 @@ 
define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1237,15 +1228,15 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_sys( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1253,8 +1244,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1265,7 +1257,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1283,15 +1275,15 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1299,8 +1291,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1311,9 +1304,9 @@ 
define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1329,15 +1322,15 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_gpu( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1345,8 +1338,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1357,9 +1351,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1375,15 +1369,15 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_sys( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1391,8 +1385,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; 
+; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1403,7 +1398,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1421,15 +1416,15 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB30_1; ; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_cta( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1437,8 +1432,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: 
membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1449,7 +1445,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1467,15 +1463,15 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_gpu( +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1483,8 +1479,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 
%rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1495,9 +1492,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1513,15 +1510,15 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB32_1; ; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1529,8 +1526,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: 
ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1541,7 +1539,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1562,12 +1560,12 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1575,8 +1573,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1587,7 +1586,7 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1608,12 +1607,12 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1621,8 +1620,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1633,7 +1633,7 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 
%r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1654,12 +1654,12 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_sys( +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1667,8 +1667,8 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1679,7 +1679,7 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; 
SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1700,12 +1700,12 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_cta( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1713,8 +1713,8 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1725,7 +1725,7 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1743,15 +1743,15 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB37_1; ; SM60-NEXT: $L__BB37_3: 
// %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_gpu( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1759,8 +1759,8 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1771,7 +1771,7 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1789,15 +1789,15 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB38_1; ; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_sys( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1805,8 +1805,8 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1817,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1835,15 +1835,15 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB39_1; ; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; 
SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_cta( +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1851,8 +1851,8 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1863,7 +1863,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1881,15 +1881,15 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB40_1; ; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_gpu( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1897,8 +1897,8 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1909,7 +1909,7 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -1927,15 +1927,15 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB41_1; ; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + 
%pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_sys( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1943,8 +1943,8 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1973,15 +1973,15 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB42_1; ; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_cta( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1989,8 +1989,8 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2001,9 +2001,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2019,15 +2019,15 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: 
membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_gpu( +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2035,8 +2035,8 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2047,7 +2047,7 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -2065,15 +2065,15 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB44_1; ; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_sys( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2081,9 +2081,8 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2094,9 +2093,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2115,12 +2114,12 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst 
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_cta( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2128,9 +2127,8 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2141,9 +2139,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2162,12 +2160,12 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") 
acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2175,9 +2173,8 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2188,9 +2185,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2209,12 +2206,12 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global_sys(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_sys( +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2222,9 +2219,8 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2235,9 +2231,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2256,12 +2252,12 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_cta( +define i8 
@acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2269,9 +2265,8 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2282,9 +2277,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2300,15 +2295,15 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 
%cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_gpu( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2316,9 +2311,8 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2329,9 +2323,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2347,15 +2341,15 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 
%new } -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_sys( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2363,9 +2357,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2376,9 +2369,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2394,15 +2387,15 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg 
ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_cta( +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2410,9 +2403,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2423,9 +2415,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2441,15 +2433,15 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, 
i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2457,9 +2449,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2470,9 +2461,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2488,15 +2479,15 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 
[func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_sys( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2504,9 +2495,8 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2517,9 +2507,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2535,14 +2525,15 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end 
+; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_cta( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2550,9 +2541,8 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2563,9 +2553,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2581,14 +2571,15 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB55_1; ; 
SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_gpu( +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2596,9 +2587,8 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2609,9 +2599,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2627,14 +2617,15 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: 
@%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_sys( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2642,9 +2633,8 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2655,9 +2645,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2673,14 +2663,15 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, 
i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_cta( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2688,9 +2679,8 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2701,9 +2691,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2719,14 
+2709,15 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_gpu( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2734,9 +2725,8 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2747,9 +2737,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: 
// %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2765,14 +2755,15 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_sys( +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2780,8 +2771,8 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2793,9 +2784,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; 
SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2811,14 +2802,15 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_cta( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2826,9 +2818,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2839,9 +2831,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; 
SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2857,14 +2849,15 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_gpu( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2872,9 +2865,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2885,9 +2878,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; 
; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2903,14 +2896,15 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_sys( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2918,9 +2912,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2931,7 +2925,7 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: 
ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -2949,15 +2943,15 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_cta( +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2965,9 +2959,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2978,9 +2972,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2996,15 +2990,15 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3012,9 +3006,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3025,9 +3019,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, 
[acquire_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3043,15 +3037,15 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_sys( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3059,9 +3053,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3072,7 +3066,7 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3090,15 +3084,15 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_cta( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3106,9 +3100,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3119,7 +3113,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3137,15 +3131,15 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_gpu( +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3153,9 +3147,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3166,9 +3160,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, 
[release_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3184,15 +3178,15 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_sys( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3200,8 +3194,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3213,7 +3207,7 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 
%r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3234,12 +3228,12 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_cta( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3247,8 +3241,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3260,7 +3254,7 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 
%r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3281,12 +3275,12 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_gpu( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3294,8 +3288,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3307,7 +3301,7 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3328,12 +3322,12 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 
[func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_sys( +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3341,8 +3335,8 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3354,7 +3348,7 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3372,15 +3366,14 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 
%cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_cta( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3388,9 +3381,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3401,7 +3394,7 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3419,15 +3412,14 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 
%cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_gpu( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3435,9 +3427,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3448,7 +3440,7 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3466,25 +3458,24 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_sys( -; SM60: { +define i8 
@release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_gpu( +; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; ; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3495,9 +3486,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3513,15 +3504,14 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_cta( 
+define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3529,9 +3519,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3542,7 +3532,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3560,15 +3550,14 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_gpu( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, 
i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3576,9 +3565,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3589,7 +3578,7 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3607,15 +3596,14 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_sys( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3623,9 +3611,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3636,9 +3624,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3654,15 +3642,14 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
release_seq_cst_i8_shared_cta( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3670,9 +3657,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3683,9 +3670,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3701,15 +3688,14 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic 
ret i8 %new } -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_gpu( +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3717,9 +3703,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3730,7 +3716,7 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -3748,15 +3734,14 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_sys(ptr 
%addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3764,8 +3749,8 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3777,9 +3762,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3795,15 +3780,14 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 
@acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3811,8 +3795,8 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3824,9 +3808,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3842,15 +3826,14 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret 
i8 %new } -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3858,8 +3841,8 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3871,9 +3854,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3889,15 +3872,14 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") 
release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_sys( +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3905,8 +3887,8 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3918,9 +3900,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3939,12 +3921,12 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
acq_rel_monotonic_i8_global_cta( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3952,9 +3934,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3965,9 +3947,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3983,15 +3965,15 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 
%new } -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3999,9 +3981,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4012,9 +3994,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4030,15 +4012,15 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4046,9 +4028,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4059,9 +4041,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4077,15 +4059,15 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4093,9 +4075,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4106,9 +4088,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4124,15 +4106,15 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB88_1; ; 
SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4140,9 +4122,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4153,9 +4135,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4171,15 +4153,15 @@ define i8 
@acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB89_1; ; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_sys( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4187,9 +4169,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4200,15 +4182,15 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB90_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB90_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4218,15 +4200,15 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB90_1; ; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_cta( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4234,9 +4216,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4247,15 +4229,15 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: 
and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB91_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4265,15 +4247,15 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB91_1; ; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4281,9 +4263,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, 
[release_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4294,15 +4276,15 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB92_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4312,15 +4294,15 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB92_1; ; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_sys( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 
%rs<2>; @@ -4328,8 +4310,8 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4341,15 +4323,15 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB93_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4362,12 +4344,12 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM60-LABEL: acq_rel_acquire_i8_global_cta( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4375,8 +4357,8 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4388,15 +4370,15 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB94_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4409,12 +4391,12 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 
%cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_gpu( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4422,8 +4404,8 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4435,15 +4417,15 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB95_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure 
@@ -4456,12 +4438,12 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_sys( +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4469,8 +4451,8 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4482,15 +4464,15 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 
%r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB96_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4503,12 +4485,12 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_cta( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4516,9 +4498,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4529,15 +4511,15 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, 
[%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB97_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4547,15 +4529,15 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB97_1; ; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4563,9 +4545,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4576,15 +4558,15 @@ define i8 
@acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB98_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4594,15 +4576,15 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB98_1; ; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4610,9 +4592,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: 
ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4623,7 +4605,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -4641,15 +4623,15 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB99_1; ; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4657,9 +4639,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; 
SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4670,15 +4652,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB100_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4688,15 +4670,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB100_1; ; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; 
SM60-NEXT: .reg .b16 %rs<2>; @@ -4704,9 +4686,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4717,15 +4699,15 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB101_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4735,15 +4717,15 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB101_1; ; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4751,9 +4733,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4764,7 +4746,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -4782,15 +4764,15 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB102_1; ; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4798,9 +4780,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4811,7 +4793,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -4829,15 +4811,15 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB103_1; ; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) 
%addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4845,9 +4827,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4858,15 +4840,15 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB104_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4876,15 +4858,15 @@ define i8 
@acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB104_1; ; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4892,8 +4874,8 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4905,7 +4887,7 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -4926,12 +4908,12 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: 
membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4939,8 +4921,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4952,7 +4934,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -4973,12 +4955,12 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, 
i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4986,8 +4968,8 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4999,7 +4981,7 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -5020,12 +5002,12 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( +define i8 
@acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5033,8 +5015,8 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5046,7 +5028,7 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -5067,12 +5049,12 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5080,9 +5062,9 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 
%rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5093,7 +5075,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -5111,15 +5093,15 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB109_1; ; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5127,9 +5109,9 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5140,7 +5122,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -5158,15 +5140,15 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB110_1; ; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_sys( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5174,9 +5156,9 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[seq_cst_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5187,15 +5169,15 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB111_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5205,15 +5187,15 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB111_1; ; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM60-LABEL: seq_cst_monotonic_i8_global_cta( +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5221,9 +5203,9 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5234,7 +5216,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -5252,15 +5234,15 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB112_1; ; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
seq_cst_monotonic_i8_global_gpu( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5268,9 +5250,9 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5281,7 +5263,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -5299,15 +5281,15 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB113_1; ; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; 
SM60-LABEL: seq_cst_monotonic_i8_shared_sys( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5315,9 +5297,9 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5328,15 +5310,15 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB114_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5346,15 +5328,15 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra 
$L__BB114_1; ; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5362,9 +5344,9 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5375,15 +5357,15 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB115_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5393,15 +5375,15 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB115_1; ; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5409,9 +5391,9 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5422,7 +5404,7 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, 
%r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; @@ -5440,15 +5422,15 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB116_1; ; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5456,8 +5438,8 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5469,15 +5451,15 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, 
[seq_cst_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB117_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5490,12 +5472,12 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_cta( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5503,8 +5485,8 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5516,15 +5498,15 @@ define i8 
@seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB118_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5537,12 +5519,12 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5550,8 +5532,8 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5563,15 +5545,15 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB119_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5584,12 +5566,12 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_sys( +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5597,8 +5579,8 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 
%rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5610,15 +5592,15 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB120_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5631,12 +5613,12 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_cta( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5644,9 +5626,9 
@@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5657,15 +5639,15 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB121_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5675,15 +5657,15 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB121_1; ; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg 
ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_gpu( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5691,9 +5673,9 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5704,15 +5686,15 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB122_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5722,15 +5704,15 @@ define i8 
@seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB122_1; ; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_sys( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5738,9 +5720,9 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5751,15 +5733,15 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB123_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB123_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5769,15 +5751,15 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB123_1; ; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_cta( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5785,9 +5767,9 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5798,15 +5780,15 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, 
%rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB124_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5816,15 +5798,15 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB124_1; ; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5832,9 +5814,9 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; 
SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5845,15 +5827,15 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB125_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5863,15 +5845,15 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB125_1; ; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { 
+; SM60-LABEL: acq_rel_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5879,9 +5861,9 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5892,15 +5874,15 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB126_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5910,15 +5892,15 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB126_1; ; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5926,9 +5908,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5939,15 +5921,15 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra 
$L__BB127_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5957,15 +5939,15 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB127_1; ; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5973,9 +5955,9 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5986,15 +5968,15 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: 
and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB128_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6004,15 +5986,15 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB128_1; ; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6020,8 +6002,8 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6033,15 +6015,15 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB129_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6054,12 +6036,12 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6067,8 +6049,8 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; 
SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6080,15 +6062,15 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB130_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6101,12 +6083,12 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6114,8 +6096,8 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6127,15 +6109,15 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB131_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6148,12 +6130,12 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; 
SM60-NEXT: .reg .b16 %rs<2>; @@ -6161,8 +6143,8 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6174,15 +6156,15 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB132_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6195,12 +6177,12 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
seq_cst_seq_cst_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6208,9 +6190,9 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -6221,15 +6203,15 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB133_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6239,15 +6221,15 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB133_1; ; SM60-NEXT: $L__BB133_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6255,9 +6237,9 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -6268,15 +6250,15 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB134_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6286,95 +6268,2254 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB134_1; ; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_sys( +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 
3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB135_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB135_1; ; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_cta( +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
acq_rel_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB136_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM60-NEXT: and.b32 %r8, 
%r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB136_1; +; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB137_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB137_1; +; SM60-NEXT: $L__BB137_3: 
// %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB138_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB138_1; +; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB139_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB139_1; +; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 
@acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB140_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB140_1; +; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; 
SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB141_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB141_1; +; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: 
+; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB142_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB142_1; +; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB143_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB143_1; +; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; 
SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB144_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB144_1; +; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; 
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB145_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB145_1; +; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, 
[seq_cst_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB146_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB146_1; +; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, 
%r2; +; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB147_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB147_1; +; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; 
SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB148_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB148_1; +; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; 
+; SM60-NEXT: @%p1 bra $L__BB149_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB149_1; +; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB150_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: 
Header=BB150_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB150_1; +; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB151_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: 
mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB151_1; +; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB152_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB152_1; +; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; 
SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB153_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB153_1; +; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB154_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB154_1; +; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) 
%addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB155_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB155_1; +; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg 
.b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB156_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB156_1; +; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB157_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB157_1; +; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB158_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB158_1; +; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: 
not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB159_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB159_1; +; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, 
[seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB160_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB160_1; +; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, 
%r16, %r2; +; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB161_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB161_1; +; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: 
or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB162_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB162_1; +; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB163_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB163_1; +; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB164_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in 
Loop: Header=BB164_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB164_1; +; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB165_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; 
SM60-NEXT: @%p2 bra $L__BB165_1; +; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB166_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB166_1; +; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB167_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB167_1; +; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB168_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB168_1; +; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: 
.reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB169_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB169_1; +; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: 
ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB170_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB170_1; +; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB171_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB171_1; +; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 
%r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB172_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB172_1; +; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, 
%r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB173_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB173_1; +; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; 
SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB174_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB174_1; +; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: 
$L__BB175_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB175_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB175_1; +; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 
%r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB176_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB176_1; +; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra 
$L__BB177_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB177_1; +; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB178_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM60-NEXT: 
and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB178_1; +; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB179_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB179_1; +; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB180_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB180_1; +; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic 
+ ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB136_3; +; SM60-NEXT: @%p1 bra $L__BB181_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB136_1; -; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB181_1; +; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 
@monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB182_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB182_1; +; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -6405,26 +8546,70 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB137_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB137_3; +; SM60-NEXT: @%p1 bra $L__BB183_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB137_1; -; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB183_1; +; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB184_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB184_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB184_1; +; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: monotonic_monotonic_i16_global_sys( ; SM60: { @@ -6449,20 +8634,20 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB138_3; +; SM60-NEXT: @%p1 bra $L__BB185_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB138_1; -; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB185_1; +; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 
[func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -6493,20 +8678,20 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB139_3; +; SM60-NEXT: @%p1 bra $L__BB186_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB139_1; -; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB186_1; +; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -6537,26 +8722,70 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB140_3; +; SM60-NEXT: @%p1 bra $L__BB187_3; ; SM60-NEXT: // 
%bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB140_1; -; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB187_1; +; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB188_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: 
Header=BB188_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB188_1; +; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: monotonic_monotonic_i16_shared_sys( ; SM60: { @@ -6581,20 +8810,20 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB141_3; +; SM60-NEXT: @%p1 bra $L__BB189_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB141_1; -; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB189_1; +; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -6610,10 +8839,54 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[monotonic_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB190_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB190_1; +; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, 
[monotonic_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6625,28 +8898,28 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB142_3; +; SM60-NEXT: @%p1 bra $L__BB191_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB142_1; -; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB191_1; +; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; 
SM60-LABEL: monotonic_acquire_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6654,10 +8927,10 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6667,25 +8940,26 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB143_3; +; SM60-NEXT: @%p1 bra $L__BB192_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB143_1; -; SM60-NEXT: $L__BB143_3: // 
%partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB192_1; +; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire ret i16 %new } @@ -6713,20 +8987,20 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB144_3; +; SM60-NEXT: @%p1 bra $L__BB193_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB144_1; -; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB193_1; +; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -6758,20 +9032,20 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, 
[%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB145_3; +; SM60-NEXT: @%p1 bra $L__BB194_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB145_1; -; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB194_1; +; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -6803,20 +9077,20 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB146_3; +; SM60-NEXT: @%p1 bra $L__BB195_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB146_1; -; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB195_1; +; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -6824,6 +9098,51 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; 
SM60-LABEL: monotonic_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB196_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB196_1; +; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: monotonic_acquire_i16_global_sys( ; SM60: { @@ -6848,20 +9167,20 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB147_3; +; SM60-NEXT: @%p1 bra $L__BB197_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB147_1; -; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB197_1; +; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -6893,20 +9212,20 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB148_3; +; SM60-NEXT: @%p1 bra $L__BB198_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB148_1; -; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra 
$L__BB198_1; +; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -6938,20 +9257,20 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB149_3; +; SM60-NEXT: @%p1 bra $L__BB199_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB149_1; -; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB199_1; +; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -6959,6 +9278,51 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; +; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB200_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB200_1; +; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: monotonic_acquire_i16_shared_sys( ; SM60: { @@ -6983,20 +9347,20 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB150_3; +; SM60-NEXT: @%p1 bra $L__BB201_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB150_1; -; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB201_1; +; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7028,20 +9392,20 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB151_3; +; SM60-NEXT: @%p1 bra $L__BB202_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB151_1; -; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB202_1; +; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7073,20 +9437,20 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop ; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB152_3; +; SM60-NEXT: @%p1 bra $L__BB203_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB152_1; -; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB203_1; +; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7094,6 +9458,52 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB204_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB204_1; +; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( ; SM60: { @@ -7119,20 +9529,20 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB153_3; +; SM60-NEXT: @%p1 bra $L__BB205_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB153_1; -; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB205_1; +; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ 
-7165,20 +9575,20 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB154_3; +; SM60-NEXT: @%p1 bra $L__BB206_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB154_1; -; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB206_1; +; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7211,20 +9621,20 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB155_3; +; SM60-NEXT: @%p1 bra $L__BB207_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; 
SM60-NEXT: @%p2 bra $L__BB155_1; -; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB207_1; +; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7232,6 +9642,52 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB208_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB208_1; +; SM60-NEXT: $L__BB208_3: // 
%partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: monotonic_seq_cst_i16_global_sys( ; SM60: { @@ -7257,20 +9713,20 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB156_3; +; SM60-NEXT: @%p1 bra $L__BB209_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB156_1; -; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB209_1; +; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7303,20 +9759,20 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 
%r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB157_3; +; SM60-NEXT: @%p1 bra $L__BB210_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB157_1; -; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB210_1; +; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7349,20 +9805,20 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB158_3; +; SM60-NEXT: @%p1 bra $L__BB211_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB158_1; -; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB211_1; +; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7370,6 +9826,52 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 
%cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB212_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB212_1; +; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( ; SM60: { @@ -7395,20 +9897,20 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 
%cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB159_3; +; SM60-NEXT: @%p1 bra $L__BB213_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB159_1; -; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB213_1; +; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7441,20 +9943,20 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB160_3; +; SM60-NEXT: @%p1 bra $L__BB214_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB160_1; -; SM60-NEXT: $L__BB160_3: // 
%partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB214_1; +; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7487,20 +9989,20 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB161_3; +; SM60-NEXT: @%p1 bra $L__BB215_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB161_1; -; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB215_1; +; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7508,6 +10010,51 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, 
[acquire_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB216_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB216_1; +; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acquire_monotonic_i16_generic_sys( ; SM60: { @@ -7532,20 +10079,20 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB162_3; +; SM60-NEXT: @%p1 bra $L__BB217_3; ; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB162_1; -; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB217_1; +; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7562,10 +10109,55 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB218_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, 
%r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB218_1; +; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7577,29 +10169,29 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB163_3; +; SM60-NEXT: @%p1 bra $L__BB219_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB163_1; 
-; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: @%p2 bra $L__BB219_1; +; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_gpu( +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7607,10 +10199,10 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7620,26 +10212,26 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB164_3; +; SM60-NEXT: @%p1 bra $L__BB220_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB164_1; -; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: @%p2 bra $L__BB220_1; +; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic ret i16 %new } @@ -7667,20 +10259,20 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB165_3; +; SM60-NEXT: @%p1 bra $L__BB221_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB165_1; -; 
SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB221_1; +; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7712,20 +10304,20 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB166_3; +; SM60-NEXT: @%p1 bra $L__BB222_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB166_1; -; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB222_1; +; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7757,20 +10349,20 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB167_3; +; SM60-NEXT: @%p1 bra $L__BB223_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB167_1; -; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB223_1; +; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7778,6 +10370,51 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB224_1; +; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acquire_monotonic_i16_shared_sys( ; SM60: { @@ -7802,20 +10439,20 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB168_3; +; SM60-NEXT: @%p1 bra $L__BB225_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB168_1; -; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB225_1; +; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7847,20 +10484,20 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB169_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB169_3; +; SM60-NEXT: @%p1 bra $L__BB226_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB169_1; -; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB226_1; +; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7892,20 +10529,20 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB170_3; +; SM60-NEXT: @%p1 bra $L__BB227_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB170_1; -; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB227_1; +; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7913,6 +10550,51 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB228_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB228_1; +; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; 
SM60-LABEL: acquire_acquire_i16_generic_sys( ; SM60: { @@ -7937,20 +10619,20 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB171_3; +; SM60-NEXT: @%p1 bra $L__BB229_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB171_1; -; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB229_1; +; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -7982,20 +10664,20 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB172_3; +; SM60-NEXT: @%p1 bra $L__BB230_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 
%p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB172_1; -; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB230_1; +; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8027,20 +10709,20 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB173_3; +; SM60-NEXT: @%p1 bra $L__BB231_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB173_1; -; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB231_1; +; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8048,6 +10730,51 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB232_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB232_1; +; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acquire_acquire_i16_global_sys( ; SM60: { @@ -8072,20 +10799,20 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 
%r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB174_3; +; SM60-NEXT: @%p1 bra $L__BB233_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB174_1; -; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB233_1; +; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8117,20 +10844,20 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB175_3; +; SM60-NEXT: @%p1 bra $L__BB234_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB175_1; -; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB234_1; +; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8162,20 +10889,20 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; 
SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB176_3; +; SM60-NEXT: @%p1 bra $L__BB235_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB176_1; -; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB235_1; +; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8183,6 +10910,51 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 
%r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB236_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB236_1; +; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acquire_acquire_i16_shared_sys( ; SM60: { @@ -8207,20 +10979,20 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB177_3; +; SM60-NEXT: @%p1 bra $L__BB237_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB177_1; -; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 
bra $L__BB237_1; +; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8252,20 +11024,20 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB178_3; +; SM60-NEXT: @%p1 bra $L__BB238_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB178_1; -; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB238_1; +; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8297,20 +11069,20 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB179_3; +; SM60-NEXT: @%p1 bra $L__BB239_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // 
in Loop: Header=BB179_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB179_1; -; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB239_1; +; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8318,6 +11090,52 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB240_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, 
%r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB240_1; +; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acquire_seq_cst_i16_generic_sys( ; SM60: { @@ -8343,20 +11161,20 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB180_3; +; SM60-NEXT: @%p1 bra $L__BB241_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB180_1; -; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB241_1; +; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8389,20 +11207,20 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 
%r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB181_3; +; SM60-NEXT: @%p1 bra $L__BB242_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB181_1; -; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB242_1; +; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8435,20 +11253,20 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB182_3; +; SM60-NEXT: @%p1 bra $L__BB243_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB182_1; -; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB243_1; +; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8456,6 +11274,52 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define 
i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB244_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB244_1; +; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acquire_seq_cst_i16_global_sys( ; SM60: { @@ -8481,29 +11345,75 @@ define i16 
@acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB183_3; +; SM60-NEXT: @%p1 bra $L__BB245_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB245_1; +; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, 
%rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB246_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB183_1; -; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: @%p2 bra $L__BB246_1; +; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_cta( +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8511,10 +11421,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, 
[acquire_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8527,29 +11437,29 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB184_3; +; SM60-NEXT: @%p1 bra $L__BB247_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB184_1; -; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: @%p2 bra $L__BB247_1; +; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_gpu( +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared( ; SM60: { ; 
SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8557,10 +11467,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8571,26 +11481,26 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB185_3; +; SM60-NEXT: @%p1 bra $L__BB248_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB185_1; -; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM60-NEXT: 
membar.gl; +; SM60-NEXT: @%p2 bra $L__BB248_1; +; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst ret i16 %new } @@ -8619,20 +11529,20 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB186_3; +; SM60-NEXT: @%p1 bra $L__BB249_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB186_1; -; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB249_1; +; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8665,20 +11575,20 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB187_3; +; SM60-NEXT: @%p1 bra $L__BB250_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB187_1; -; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB250_1; +; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8711,20 +11621,20 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB188_3; +; SM60-NEXT: @%p1 bra $L__BB251_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB188_1; -; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB251_1; +; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -8732,6 +11642,51 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_monotonic_i16_generic(ptr 
%addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB252_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB252_1; +; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: release_monotonic_i16_generic_sys( ; SM60: { @@ -8757,20 +11712,20 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB189_3; +; SM60-NEXT: @%p1 bra $L__BB253_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB189_1; -; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB253_1; +; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -8802,20 +11757,20 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB190_3; +; SM60-NEXT: @%p1 bra $L__BB254_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB190_1; -; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end 
+; SM60-NEXT: @%p2 bra $L__BB254_1; +; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -8847,26 +11802,71 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB191_3; +; SM60-NEXT: @%p1 bra $L__BB255_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB191_1; -; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB255_1; +; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, 
[release_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB256_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB256_1; +; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: release_monotonic_i16_global_sys( ; SM60: { @@ -8892,20 +11892,20 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 
bra $L__BB192_3; +; SM60-NEXT: @%p1 bra $L__BB257_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB192_1; -; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB257_1; +; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -8937,20 +11937,20 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB193_3; +; SM60-NEXT: @%p1 bra $L__BB258_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB193_1; -; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB258_1; +; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -8980,25 +11980,70 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB259_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB259_1; +; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; 
SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB194_3; +; SM60-NEXT: @%p1 bra $L__BB260_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB194_1; -; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB260_1; +; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic ret i16 %new } @@ -9027,20 +12072,20 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB195_3; +; SM60-NEXT: @%p1 bra $L__BB261_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, 
%r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB195_1; -; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB261_1; +; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -9072,20 +12117,20 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB196_3; +; SM60-NEXT: @%p1 bra $L__BB262_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB196_1; -; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB262_1; +; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -9117,26 +12162,72 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB197_3; +; SM60-NEXT: @%p1 bra $L__BB263_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB197_1; -; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB263_1; +; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; 
+; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB264_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB264_1; +; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: release_acquire_i16_generic_sys( ; SM60: { @@ -9162,20 +12253,20 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB198_3; +; SM60-NEXT: @%p1 bra $L__BB265_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB198_1; -; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB265_1; +; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9208,20 +12299,20 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 
%cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB199_3; +; SM60-NEXT: @%p1 bra $L__BB266_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB199_1; -; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB266_1; +; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9254,20 +12345,20 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB200_3; +; SM60-NEXT: @%p1 bra $L__BB267_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB200_1; -; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; 
SM60-NEXT: @%p2 bra $L__BB267_1; +; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9275,6 +12366,52 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB268_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB268_1; +; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; 
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: release_acquire_i16_global_sys( ; SM60: { @@ -9300,20 +12437,20 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB201_3; +; SM60-NEXT: @%p1 bra $L__BB269_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB201_1; -; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB269_1; +; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9346,20 +12483,20 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB270_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB202_3; +; SM60-NEXT: @%p1 
bra $L__BB270_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB270_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB202_1; -; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB270_1; +; SM60-NEXT: $L__BB270_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9392,20 +12529,20 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB271_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB203_3; +; SM60-NEXT: @%p1 bra $L__BB271_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB203_1; -; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB271_1; +; SM60-NEXT: $L__BB271_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9413,6 +12550,52 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: 
.reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB272_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB272_1; +; SM60-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: release_acquire_i16_shared_sys( ; SM60: { @@ -9438,20 +12621,20 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; 
SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB273_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB204_3; +; SM60-NEXT: @%p1 bra $L__BB273_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB273_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB204_1; -; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB273_1; +; SM60-NEXT: $L__BB273_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9484,20 +12667,20 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB274_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB205_3; +; SM60-NEXT: @%p1 bra $L__BB274_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB274_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB205_1; -; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB274_1; +; SM60-NEXT: $L__BB274_3: // %partword.cmpxchg.end ; SM60-NEXT: 
membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9530,24 +12713,70 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB275_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB275_1; +; SM60-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB276_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB206_3; +; SM60-NEXT: @%p1 bra $L__BB276_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB206_1; -; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: @%p2 bra $L__BB276_1; +; SM60-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst ret i16 %new } @@ -9576,20 +12805,20 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB277_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB207_3; +; SM60-NEXT: @%p1 bra $L__BB277_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: 
Header=BB207_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB207_1; -; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB277_1; +; SM60-NEXT: $L__BB277_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9622,20 +12851,20 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB278_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB208_3; +; SM60-NEXT: @%p1 bra $L__BB278_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB208_1; -; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB278_1; +; SM60-NEXT: $L__BB278_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9668,20 +12897,20 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB279_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: 
or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB209_3; +; SM60-NEXT: @%p1 bra $L__BB279_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB209_1; -; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB279_1; +; SM60-NEXT: $L__BB279_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9689,6 +12918,52 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, 
%r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB280_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB280_1; +; SM60-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: release_seq_cst_i16_global_sys( ; SM60: { @@ -9714,20 +12989,20 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB281_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB210_3; +; SM60-NEXT: @%p1 bra $L__BB281_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB210_1; -; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB281_1; +; SM60-NEXT: $L__BB281_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9760,20 +13035,20 @@ define i16 
@release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB282_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB211_3; +; SM60-NEXT: @%p1 bra $L__BB282_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB211_1; -; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB282_1; +; SM60-NEXT: $L__BB282_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9806,20 +13081,20 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB283_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB212_3; +; SM60-NEXT: @%p1 bra $L__BB283_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; 
SM60-NEXT: @%p2 bra $L__BB212_1; -; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB283_1; +; SM60-NEXT: $L__BB283_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9827,6 +13102,52 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB284_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB284_1; +; SM60-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; 
SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: release_seq_cst_i16_shared_sys( ; SM60: { @@ -9852,20 +13173,20 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB285_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB213_3; +; SM60-NEXT: @%p1 bra $L__BB285_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB285_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB213_1; -; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB285_1; +; SM60-NEXT: $L__BB285_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9898,20 +13219,20 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB286_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB214_3; +; SM60-NEXT: @%p1 bra $L__BB286_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB214_1; -; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB286_1; +; SM60-NEXT: $L__BB286_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9944,20 +13265,20 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB287_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB215_3; +; SM60-NEXT: @%p1 bra $L__BB287_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB215_1; -; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB287_1; +; SM60-NEXT: $L__BB287_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -9965,6 +13286,52 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acq_rel_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB288_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB288_1; +; SM60-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( ; SM60: { @@ -9990,20 +13357,20 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, 
[%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB289_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB216_3; +; SM60-NEXT: @%p1 bra $L__BB289_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB289_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB216_1; -; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB289_1; +; SM60-NEXT: $L__BB289_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10036,20 +13403,20 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB290_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB217_3; +; SM60-NEXT: @%p1 bra $L__BB290_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB217_1; -; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB290_1; +; SM60-NEXT: $L__BB290_3: // 
%partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10082,20 +13449,20 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB291_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB218_3; +; SM60-NEXT: @%p1 bra $L__BB291_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB218_1; -; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB291_1; +; SM60-NEXT: $L__BB291_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10103,6 +13470,52 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB292_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB292_1; +; SM60-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acq_rel_monotonic_i16_global_sys( ; SM60: { @@ -10128,20 +13541,20 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB293_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB219_3; +; SM60-NEXT: @%p1 bra $L__BB293_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB219_1; -; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB293_1; +; SM60-NEXT: $L__BB293_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10174,20 +13587,20 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB294_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB220_3; +; SM60-NEXT: @%p1 bra $L__BB294_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB294_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB220_1; -; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB294_1; +; SM60-NEXT: $L__BB294_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10220,24 +13633,70 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB295_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB295_1; +; SM60-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB296_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB221_3; +; SM60-NEXT: @%p1 bra $L__BB296_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB221_1; -; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: @%p2 bra $L__BB296_1; +; SM60-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic ret i16 %new } @@ -10266,20 +13725,20 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB297_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB222_3; +; SM60-NEXT: @%p1 bra $L__BB297_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB222_1; -; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB297_1; +; SM60-NEXT: $L__BB297_3: // %partword.cmpxchg.end ; SM60-NEXT: 
membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10312,20 +13771,20 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB298_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB223_3; +; SM60-NEXT: @%p1 bra $L__BB298_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB223_1; -; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB298_1; +; SM60-NEXT: $L__BB298_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10358,20 +13817,20 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB299_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: @%p1 bra $L__BB299_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB299_1 Depth=1 
; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB224_1; -; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB299_1; +; SM60-NEXT: $L__BB299_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10379,6 +13838,52 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB300_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; 
+; SM60-NEXT: @%p2 bra $L__BB300_1; +; SM60-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acq_rel_acquire_i16_generic_sys( ; SM60: { @@ -10404,20 +13909,20 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB301_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB225_3; +; SM60-NEXT: @%p1 bra $L__BB301_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB225_1; -; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB301_1; +; SM60-NEXT: $L__BB301_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10450,20 +13955,20 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB302_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: 
atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB226_3; +; SM60-NEXT: @%p1 bra $L__BB302_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB226_1; -; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB302_1; +; SM60-NEXT: $L__BB302_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10496,20 +14001,20 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB303_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB227_3; +; SM60-NEXT: @%p1 bra $L__BB303_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB227_1; -; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB303_1; +; SM60-NEXT: $L__BB303_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10517,6 +14022,52 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, 
i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB304_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB304_1; +; SM60-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acq_rel_acquire_i16_global_sys( ; SM60: { @@ -10542,20 +14093,20 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB305_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB228_3; +; SM60-NEXT: @%p1 bra $L__BB305_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB228_1; -; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB305_1; +; SM60-NEXT: $L__BB305_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10588,20 +14139,20 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB306_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB229_3; +; SM60-NEXT: @%p1 bra $L__BB306_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB306_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB229_1; -; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end 
+; SM60-NEXT: @%p2 bra $L__BB306_1; +; SM60-NEXT: $L__BB306_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10634,20 +14185,20 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB307_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB230_3; +; SM60-NEXT: @%p1 bra $L__BB307_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB230_1; -; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB307_1; +; SM60-NEXT: $L__BB307_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10655,6 +14206,52 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB308_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB308_1; +; SM60-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acq_rel_acquire_i16_shared_sys( ; SM60: { @@ -10680,20 +14277,20 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB309_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB231_3; +; 
SM60-NEXT: @%p1 bra $L__BB309_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB231_1; -; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB309_1; +; SM60-NEXT: $L__BB309_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10726,20 +14323,20 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB310_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB232_3; +; SM60-NEXT: @%p1 bra $L__BB310_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB310_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB232_1; -; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB310_1; +; SM60-NEXT: $L__BB310_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10772,20 +14369,20 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM60-NEXT: 
$L__BB311_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB233_3; +; SM60-NEXT: @%p1 bra $L__BB311_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB233_1; -; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB311_1; +; SM60-NEXT: $L__BB311_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10793,6 +14390,52 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB312_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB312_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB312_1; +; SM60-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys( ; SM60: { @@ -10818,20 +14461,20 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB313_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB234_3; +; SM60-NEXT: @%p1 bra $L__BB313_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB234_1; -; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB313_1; +; SM60-NEXT: $L__BB313_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 
[func_retval0], %r14; ; SM60-NEXT: ret; @@ -10864,20 +14507,20 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB314_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB235_3; +; SM60-NEXT: @%p1 bra $L__BB314_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB314_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB235_1; -; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB314_1; +; SM60-NEXT: $L__BB314_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -10908,26 +14551,72 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB315_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: 
mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB315_1; +; SM60-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB316_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB236_3; +; SM60-NEXT: @%p1 bra $L__BB316_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: 
mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB236_1; -; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: @%p2 bra $L__BB316_1; +; SM60-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst ret i16 %new } @@ -10956,20 +14645,20 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB317_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB237_3; +; SM60-NEXT: @%p1 bra $L__BB317_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB237_1; -; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB317_1; +; SM60-NEXT: $L__BB317_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11002,20 +14691,20 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB318_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB238_3; +; SM60-NEXT: @%p1 bra $L__BB318_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB238_1; -; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB318_1; +; SM60-NEXT: $L__BB318_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11048,20 +14737,20 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB319_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB239_3; +; SM60-NEXT: @%p1 bra $L__BB319_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB319_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB239_1; -; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB319_1; +; SM60-NEXT: $L__BB319_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11069,6 +14758,52 @@ define i16 
@acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB320_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB320_1; +; SM60-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: 
acq_rel_seq_cst_i16_shared_sys( ; SM60: { @@ -11094,20 +14829,20 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB321_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB240_3; +; SM60-NEXT: @%p1 bra $L__BB321_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB240_1; -; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB321_1; +; SM60-NEXT: $L__BB321_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11140,20 +14875,20 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB322_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB241_3; +; SM60-NEXT: @%p1 bra $L__BB322_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB322_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; 
; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB241_1; -; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB322_1; +; SM60-NEXT: $L__BB322_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11186,20 +14921,20 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB323_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB242_3; +; SM60-NEXT: @%p1 bra $L__BB323_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB242_1; -; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB323_1; +; SM60-NEXT: $L__BB323_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11207,6 +14942,52 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM60-NEXT: 
ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB324_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB324_1; +; SM60-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( ; SM60: { @@ -11232,20 +15013,20 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB325_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: 
atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB243_3; +; SM60-NEXT: @%p1 bra $L__BB325_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB325_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB243_1; -; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB325_1; +; SM60-NEXT: $L__BB325_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11278,20 +15059,20 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB326_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB244_3; +; SM60-NEXT: @%p1 bra $L__BB326_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB326_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB244_1; -; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB326_1; +; SM60-NEXT: $L__BB326_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11324,20 +15105,20 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: 
and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB327_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB245_3; +; SM60-NEXT: @%p1 bra $L__BB327_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB245_1; -; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB327_1; +; SM60-NEXT: $L__BB327_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11345,6 +15126,52 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; 
SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB328_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB328_1; +; SM60-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_monotonic_i16_global_sys( ; SM60: { @@ -11370,20 +15197,20 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB329_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB246_3; +; SM60-NEXT: @%p1 bra $L__BB329_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB246_1; -; SM60-NEXT: $L__BB246_3: // 
%partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB329_1; +; SM60-NEXT: $L__BB329_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11416,20 +15243,20 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB330_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB247_3; +; SM60-NEXT: @%p1 bra $L__BB330_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB330_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB247_1; -; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB330_1; +; SM60-NEXT: $L__BB330_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11462,20 +15289,20 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB331_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB248_3; +; SM60-NEXT: @%p1 bra $L__BB331_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB248_1; -; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB331_1; +; SM60-NEXT: $L__BB331_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11483,6 +15310,52 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB332_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM60-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB332_1; +; SM60-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( ; SM60: { @@ -11508,20 +15381,20 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB333_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB249_3; +; SM60-NEXT: @%p1 bra $L__BB333_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB333_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB249_1; -; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB333_1; +; SM60-NEXT: $L__BB333_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11554,20 +15427,20 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB250_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB334_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB250_3; +; SM60-NEXT: @%p1 bra $L__BB334_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB250_1; -; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB334_1; +; SM60-NEXT: $L__BB334_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11598,26 +15471,72 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB335_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB335_1; +; SM60-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 
%cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB336_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB251_3; +; SM60-NEXT: @%p1 bra $L__BB336_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB251_1; -; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: @%p2 bra $L__BB336_1; +; SM60-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; 
SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire ret i16 %new } @@ -11646,20 +15565,20 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB337_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB252_3; +; SM60-NEXT: @%p1 bra $L__BB337_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB337_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB252_1; -; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB337_1; +; SM60-NEXT: $L__BB337_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11692,20 +15611,20 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB338_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB253_3; +; SM60-NEXT: @%p1 bra $L__BB338_3; ; SM60-NEXT: // 
%bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB253_1; -; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB338_1; +; SM60-NEXT: $L__BB338_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11738,20 +15657,20 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB339_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB254_3; +; SM60-NEXT: @%p1 bra $L__BB339_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB254_1; -; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB339_1; +; SM60-NEXT: $L__BB339_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11759,6 +15678,52 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; 
SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB340_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB340_1; +; SM60-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_acquire_i16_global_sys( ; SM60: { @@ -11784,20 +15749,20 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; 
SM60-NEXT: $L__BB341_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB255_3; +; SM60-NEXT: @%p1 bra $L__BB341_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB255_1; -; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB341_1; +; SM60-NEXT: $L__BB341_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11830,20 +15795,20 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB342_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB256_3; +; SM60-NEXT: @%p1 bra $L__BB342_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB342_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB256_1; -; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB342_1; +; SM60-NEXT: $L__BB342_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], 
%r14; ; SM60-NEXT: ret; @@ -11876,20 +15841,20 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB343_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB257_3; +; SM60-NEXT: @%p1 bra $L__BB343_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB257_1; -; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB343_1; +; SM60-NEXT: $L__BB343_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11897,6 +15862,52 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; 
SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB344_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB344_1; +; SM60-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_acquire_i16_shared_sys( ; SM60: { @@ -11922,20 +15933,20 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB345_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB258_3; +; SM60-NEXT: @%p1 bra $L__BB345_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM60-NEXT: // in 
Loop: Header=BB345_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB258_1; -; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB345_1; +; SM60-NEXT: $L__BB345_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -11968,20 +15979,20 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB346_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB259_3; +; SM60-NEXT: @%p1 bra $L__BB346_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB259_1; -; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB346_1; +; SM60-NEXT: $L__BB346_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12014,20 +16025,20 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB347_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, 
%r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB260_3; +; SM60-NEXT: @%p1 bra $L__BB347_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB260_1; -; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB347_1; +; SM60-NEXT: $L__BB347_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12035,6 +16046,52 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; 
SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB348_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB348_1; +; SM60-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( ; SM60: { @@ -12060,20 +16117,20 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB349_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB261_3; +; SM60-NEXT: @%p1 bra $L__BB349_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB349_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB261_1; -; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB349_1; +; SM60-NEXT: $L__BB349_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12106,20 +16163,20 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 
%r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB350_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB262_3; +; SM60-NEXT: @%p1 bra $L__BB350_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB262_1; -; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB350_1; +; SM60-NEXT: $L__BB350_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12152,20 +16209,20 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB351_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB263_3; +; SM60-NEXT: @%p1 bra $L__BB351_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB263_1; -; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB351_1; +; 
SM60-NEXT: $L__BB351_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12173,6 +16230,52 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB352_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB352_1; +; SM60-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_seq_cst_i16_global_sys( ; SM60: { @@ -12198,20 +16301,20 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB353_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB264_3; +; SM60-NEXT: @%p1 bra $L__BB353_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB264_1; -; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB353_1; +; SM60-NEXT: $L__BB353_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12244,20 +16347,20 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB354_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB265_3; +; SM60-NEXT: @%p1 bra $L__BB354_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB265_1; -; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB354_1; +; SM60-NEXT: $L__BB354_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12288,26 +16391,72 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB355_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB355_1; +; SM60-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // 
%bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB356_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB266_3; +; SM60-NEXT: @%p1 bra $L__BB356_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB356_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB266_1; -; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: @%p2 bra $L__BB356_1; +; SM60-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } @@ -12336,20 +16485,20 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 
%cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB357_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB267_3; +; SM60-NEXT: @%p1 bra $L__BB357_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB357_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB267_1; -; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB357_1; +; SM60-NEXT: $L__BB357_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12382,20 +16531,20 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB358_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB268_3; +; SM60-NEXT: @%p1 bra $L__BB358_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB268_1; -; SM60-NEXT: $L__BB268_3: // 
%partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB358_1; +; SM60-NEXT: $L__BB358_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12428,20 +16577,20 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB359_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB269_3; +; SM60-NEXT: @%p1 bra $L__BB359_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB269_1; -; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM60-NEXT: @%p2 bra $L__BB359_1; +; SM60-NEXT: $L__BB359_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12449,6 +16598,23 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_generic_sys( ; SM60: { @@ -12500,6 +16666,23 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_global_sys( ; SM60: { @@ -12551,6 +16734,23 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ret i32 %new } +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg 
ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_shared_sys( ; SM60: { @@ -12602,6 +16802,23 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ret i32 %new } +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_generic_sys( ; SM60: { @@ -12653,6 +16870,23 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 
@monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_global_sys( ; SM60: { @@ -12704,6 +16938,23 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_shared_sys( ; SM60: { @@ -12755,6 +17006,24 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: 
monotonic_seq_cst_i32_generic_sys( ; SM60: { @@ -12809,6 +17078,24 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_seq_cst_i32_global_sys( ; SM60: { @@ -12863,6 +17150,24 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: 
monotonic_seq_cst_i32_shared_sys( ; SM60: { @@ -12917,6 +17222,23 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_generic_sys( ; SM60: { @@ -12968,6 +17290,23 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_global_sys( ; SM60: { @@ -13019,6 +17358,23 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr 
addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_shared_sys( ; SM60: { @@ -13070,6 +17426,23 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_generic_sys( ; SM60: { @@ -13121,6 +17494,23 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; 
SM60-LABEL: acquire_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_global_sys( ; SM60: { @@ -13172,6 +17562,23 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_shared_sys( ; SM60: { @@ -13223,6 +17630,24 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_generic_sys( ; SM60: { @@ -13277,6 +17702,24 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_global_sys( ; SM60: { @@ -13331,6 +17774,24 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_shared_sys( ; SM60: { @@ -13385,6 +17846,23 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_monotonic_i32_generic_sys( ; SM60: { @@ -13432,7 +17910,24 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; 
SM60-LABEL: release_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic ret i32 %new } @@ -13487,6 +17982,23 @@ define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_monotonic_i32_shared_sys( ; SM60: { @@ -13538,6 +18050,23 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM60-NEXT: 
ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_generic_sys( ; SM60: { @@ -13589,6 +18118,23 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_global_sys( ; SM60: { @@ -13640,6 +18186,23 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; +; SM60-NEXT: 
atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_shared_sys( ; SM60: { @@ -13691,6 +18254,24 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_generic_sys( ; SM60: { @@ -13745,6 +18326,24 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: 
ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_global_sys( ; SM60: { @@ -13799,6 +18398,24 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_shared_sys( ; SM60: { @@ -13853,6 +18470,23 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} 
+ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_generic_sys( ; SM60: { @@ -13904,6 +18538,23 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_global_sys( ; SM60: { @@ -13955,6 +18606,23 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; 
SM60-LABEL: acq_rel_monotonic_i32_shared_sys( ; SM60: { @@ -14006,6 +18674,23 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_generic_sys( ; SM60: { @@ -14057,6 +18742,23 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_global_sys( ; SM60: { @@ -14108,6 +18810,23 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 
%cmp, i32 ret i32 %new } +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_shared_sys( ; SM60: { @@ -14159,6 +18878,24 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys( ; SM60: { @@ -14213,6 +18950,24 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
acq_rel_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_global_sys( ; SM60: { @@ -14267,6 +19022,24 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys( ; SM60: { @@ -14321,6 +19094,24 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 
%r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_monotonic_i32_generic_sys( ; SM60: { @@ -14375,6 +19166,24 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_monotonic_i32_global_sys( ; SM60: { @@ -14429,6 +19238,24 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; 
SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_monotonic_i32_shared_sys( ; SM60: { @@ -14483,6 +19310,24 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_acquire_i32_generic_sys( ; SM60: { @@ -14537,6 +19382,24 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; 
+; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_acquire_i32_global_sys( ; SM60: { @@ -14587,7 +19450,25 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire ret i32 %new } @@ -14645,6 +19526,24 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys( ; SM60: { @@ -14699,6 +19598,24 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_global_sys( ; SM60: { @@ -14753,6 +19670,24 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[seq_cst_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys( ; SM60: { @@ -14807,6 +19742,22 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_generic_sys( ; SM60: { @@ -14855,6 +19806,22 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM60-NEXT: 
ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_global_sys( ; SM60: { @@ -14903,6 +19870,22 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ret i64 %new } +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_shared_sys( ; SM60: { @@ -14951,6 +19934,22 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ret i64 %new } +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_generic_sys( ; SM60: { @@ -14999,6 +19998,22 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_global_sys( ; SM60: { @@ -15047,6 +20062,22 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 
%new +} + define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_shared_sys( ; SM60: { @@ -15095,6 +20126,23 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_generic_sys( ; SM60: { @@ -15146,6 +20194,23 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: 
monotonic_seq_cst_i64_global_sys( ; SM60: { @@ -15197,6 +20262,23 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_shared_sys( ; SM60: { @@ -15248,6 +20330,22 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_generic_sys( ; SM60: { @@ -15296,6 +20394,22 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 
%new) { ret i64 %new } +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_global_sys( ; SM60: { @@ -15344,6 +20458,22 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_shared_sys( ; SM60: { @@ -15392,6 +20522,22 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
acquire_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_generic_sys( ; SM60: { @@ -15440,6 +20586,22 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_global_sys( ; SM60: { @@ -15488,6 +20650,22 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM60-NEXT: 
ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_shared_sys( ; SM60: { @@ -15536,6 +20714,23 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_generic_sys( ; SM60: { @@ -15587,6 +20782,23 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM60-NEXT: 
atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_global_sys( ; SM60: { @@ -15638,6 +20850,23 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_shared_sys( ; SM60: { @@ -15689,6 +20918,22 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr 
%addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_monotonic_i64_generic_sys( ; SM60: { @@ -15737,6 +20982,22 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_monotonic_i64_global_sys( ; SM60: { @@ -15781,7 +21042,23 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[release_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic ret i64 %new } @@ -15833,6 +21110,22 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_generic_sys( ; SM60: { @@ -15881,6 +21174,22 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) 
{ ; SM60-LABEL: release_acquire_i64_global_sys( ; SM60: { @@ -15929,6 +21238,22 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_shared_sys( ; SM60: { @@ -15977,6 +21302,23 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_generic_sys( ; SM60: { @@ -16028,6 +21370,23 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } 
+define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_global_sys( ; SM60: { @@ -16079,6 +21438,23 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_shared_sys( ; SM60: { @@ -16130,6 +21506,22 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: 
acq_rel_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM60: { @@ -16178,6 +21570,22 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_global_sys( ; SM60: { @@ -16226,6 +21634,22 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM60: { @@ -16274,6 +21698,22 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_generic_sys( ; SM60: { @@ -16322,6 +21762,22 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: 
atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_global_sys( ; SM60: { @@ -16370,6 +21826,22 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_shared_sys( ; SM60: { @@ -16418,6 +21890,23 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 
%cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM60: { @@ -16469,6 +21958,23 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM60: { @@ -16520,6 +22026,23 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 
%cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM60: { @@ -16571,6 +22094,23 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM60: { @@ -16622,6 +22162,23 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_global_sys( ; SM60: { @@ -16673,6 +22230,23 @@ define i64 
@seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM60: { @@ -16724,6 +22298,23 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_generic_sys( ; SM60: { @@ -16775,6 +22366,23 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_acquire_i64_global(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_global_sys( ; SM60: { @@ -16826,6 +22434,23 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_shared_sys( ; SM60: { @@ -16877,6 +22502,23 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 
%rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM60: { @@ -16928,6 +22570,23 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM60: { @@ -16979,6 +22638,23 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; 
+; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM60: { diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index ddedc7ea36252..30f3b02b89e77 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: 
and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -68,7 +68,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 
%r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -113,7 +113,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB2_3: 
// %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -178,12 +178,12 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -203,7 +203,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -223,12 +223,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -248,7 +248,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -268,12 +268,12 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -313,12 +313,12 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, 
i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -358,12 +358,12 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, 
i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -383,7 +383,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -403,12 +403,12 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
monotonic_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -416,8 +416,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -446,15 +446,14 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -462,8 +461,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -474,9 +473,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -492,15 +491,14 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 
@monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -508,8 +506,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -520,9 +518,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -538,15 +536,14 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM70-LABEL: monotonic_acquire_i8_global_sys( +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -554,8 +551,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -566,9 +563,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -587,12 +584,12 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global_cta( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 
%new) { +; SM70-LABEL: monotonic_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -600,8 +597,8 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -612,9 +609,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -630,15 +627,15 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
monotonic_acquire_i8_global_gpu( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -646,8 +643,8 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -658,9 +655,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -676,15 +673,15 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 
@monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_sys( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -692,8 +689,8 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -704,9 +701,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -722,15 +719,15 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 
%cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_cta( +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -738,8 +735,8 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -750,9 +747,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -768,15 +765,15 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_gpu( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -784,8 +781,8 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -796,9 +793,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -814,15 +811,15 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 
[func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -830,9 +827,8 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -843,9 +839,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -861,15 +857,15 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // 
%partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -877,9 +873,8 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -890,9 +885,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -908,15 +903,15 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; 
SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -924,9 +919,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -937,9 +931,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -955,15 +949,15 @@ 
define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_sys( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -971,9 +965,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -984,9 +977,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: 
$L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1005,12 +998,12 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_cta( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1018,9 +1011,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1031,9 +1023,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1052,12 +1044,12 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1065,9 +1057,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1078,9 +1069,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1099,12 +1090,12 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1112,8 +1103,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1125,9 +1116,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-1146,12 +1137,12 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1159,9 +1150,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1172,9 +1163,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1190,15 +1181,15 @@ define i8 
@monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1206,9 +1197,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1219,9 +1210,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, 
%r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1237,15 +1228,15 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_sys( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1253,8 +1244,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1265,7 +1257,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: 
ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1283,15 +1275,15 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1299,8 +1291,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1311,9 +1304,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: 
and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1329,15 +1322,15 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_gpu( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1345,8 +1338,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1357,9 +1351,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1375,15 +1369,15 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_sys( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1391,8 +1385,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1403,7 +1398,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; 
+; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1421,15 +1416,15 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB30_1; ; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_cta( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1437,8 +1432,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1449,7 +1445,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; 
SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1467,15 +1463,15 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB31_1; ; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_gpu( +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1483,8 +1479,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1495,9 +1492,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: 
ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1513,15 +1510,15 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB32_1; ; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1529,8 +1526,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1541,7 +1539,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1562,12 +1560,12 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1575,8 +1573,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1587,7 +1586,7 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, 
[monotonic_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1608,12 +1607,12 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1621,8 +1620,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1633,7 +1633,7 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, 
%r16, %r2; @@ -1654,12 +1654,12 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_sys( +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1667,8 +1667,8 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1679,7 +1679,7 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1700,12 +1700,12 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 
%cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_cta( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1713,8 +1713,8 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1725,7 +1725,7 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1743,15 +1743,15 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB37_1; ; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 
%new } -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_gpu( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1759,8 +1759,8 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1771,7 +1771,7 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1789,15 +1789,15 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB38_1; ; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
acquire_acquire_i8_global_sys( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1805,8 +1805,8 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1817,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1835,15 +1835,15 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB39_1; ; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 
@acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global_cta( +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1851,8 +1851,8 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1863,7 +1863,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1881,15 +1881,15 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB40_1; ; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM70-LABEL: acquire_acquire_i8_global_gpu( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1897,8 +1897,8 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1909,7 +1909,7 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -1927,15 +1927,15 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB41_1; ; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_sys( 
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1943,8 +1943,8 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1973,15 +1973,15 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB42_1; ; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 
@acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_cta( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1989,8 +1989,8 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2001,9 +2001,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2019,15 +2019,15 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB43_1; ; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg 
ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_gpu( +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2035,8 +2035,8 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2047,7 +2047,7 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -2065,15 +2065,15 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB44_1; ; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic 
ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_sys( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2081,9 +2081,8 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2094,9 +2093,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2115,12 +2114,12 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 
%new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_cta( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2128,9 +2127,8 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2141,9 +2139,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2162,12 +2160,12 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( +define i8 
@acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2175,9 +2173,8 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2188,9 +2185,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2209,12 +2206,12 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_sys( +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) 
{ +; SM70-LABEL: acquire_acquire_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2222,9 +2219,8 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2235,9 +2231,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2256,12 +2252,12 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_cta( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; 
SM70-NEXT: .reg .b16 %rs<2>; @@ -2269,9 +2265,8 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2282,9 +2277,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2300,15 +2295,15 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB49_1; ; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_gpu( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; 
SM70-LABEL: acquire_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2316,9 +2311,8 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2329,9 +2323,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2347,15 +2341,15 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB50_1; ; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
acquire_seq_cst_i8_shared_sys( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2363,9 +2357,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2376,9 +2369,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2394,15 +2387,15 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB51_1; ; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 
@acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_cta( +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2410,9 +2403,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2423,9 +2415,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2441,15 +2433,15 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB52_1; ; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg 
ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2457,9 +2449,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2470,9 +2461,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2488,15 +2479,15 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB53_1; ; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_sys( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2504,9 +2495,8 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2517,9 +2507,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2535,14 +2525,15 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB54_1; ; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_cta( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2550,9 +2541,8 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2563,9 +2553,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2581,14 +2571,15 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB55_1; ; SM70-NEXT: $L__BB55_3: // 
%partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_gpu( +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2596,9 +2587,8 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2609,9 +2599,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2627,14 +2617,15 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra 
$L__BB56_1; ; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_sys( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2642,9 +2633,8 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2655,9 +2645,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2673,14 +2663,15 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) 
%addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB57_1; ; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_cta( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2688,9 +2679,8 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2701,9 +2691,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 @@ -2719,14 +2709,15 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB58_1; ; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_gpu( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2734,9 +2725,8 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2747,9 +2737,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: 
and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2765,14 +2755,15 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB59_1; ; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_sys( +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2780,9 +2771,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2793,9 +2784,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2811,14 +2802,15 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_cta( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2826,9 +2818,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2839,9 +2831,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: 
ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2857,14 +2849,15 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB61_1; ; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_gpu( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2872,9 +2865,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2885,9 +2878,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2903,14 +2896,15 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB62_1; ; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_sys( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2918,9 +2912,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2931,7 +2925,7 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 
%r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -2949,15 +2943,15 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB63_1; ; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_cta( +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2965,9 +2959,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2978,9 +2972,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2996,15 +2990,15 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3012,9 +3006,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3025,9 +3019,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3043,15 +3037,15 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_sys( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3059,9 +3053,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, 
%r9, 3; @@ -3072,7 +3066,7 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3090,15 +3084,15 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB66_1; ; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_cta( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3106,9 +3100,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 
%r10, %r9, 3; @@ -3119,7 +3113,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3137,15 +3131,15 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB67_1; ; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_gpu( +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3153,9 +3147,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, 
%r9, 3; @@ -3166,9 +3160,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3184,15 +3178,15 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB68_1; ; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_sys( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3200,9 +3194,9 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3213,7 +3207,7 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3234,12 +3228,12 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_cta( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3247,9 +3241,9 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3260,7 +3254,7 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3281,12 +3275,12 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_gpu( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3294,9 +3288,9 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3307,7 +3301,7 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; 
SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3328,12 +3322,12 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_sys( +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3341,9 +3335,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3354,7 +3348,7 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3372,15 +3366,14 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB72_1; ; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_cta( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3388,9 +3381,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3401,7 +3394,7 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, 
[release_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3419,15 +3412,14 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB73_1; ; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_gpu( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3435,10 +3427,10 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; ; SM70-NEXT: shl.b32 %r1, %r10, 3; @@ -3448,7 +3440,7 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, 
[release_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3466,15 +3458,14 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB74_1; ; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_sys( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3482,9 +3473,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3495,9 +3486,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3513,15 +3504,14 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB75_1; ; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_cta( +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3529,9 +3519,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3542,7 +3532,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3560,15 +3550,14 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB76_1; ; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_gpu( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3576,9 +3565,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3589,7 +3578,7 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3607,15 +3596,14 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB77_1; ; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_sys( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3623,9 +3611,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3636,9 +3624,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3654,15 +3642,14 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB78_1; ; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_cta( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3670,9 +3657,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3683,9 +3670,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3701,15 +3688,14 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB79_1; ; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_gpu( +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3717,9 +3703,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; 
@@ -3730,7 +3716,7 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -3748,15 +3734,14 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB80_1; ; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3764,8 +3749,8 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3777,9 +3762,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; 
SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3795,15 +3780,14 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB81_1; ; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3811,8 +3795,8 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3824,9 +3808,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr 
%addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3842,15 +3826,14 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB82_1; ; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3858,8 +3841,8 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3871,9 +3854,9 @@ define i8 
@acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3889,15 +3872,14 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB83_1; ; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_sys( +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3905,8 +3887,8 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3918,9 
+3900,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3939,12 +3921,12 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_cta( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3952,9 +3934,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3965,9 +3947,9 @@ define i8 
@acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3983,15 +3965,15 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB85_1; ; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_gpu( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3999,9 +3981,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4012,9 +3994,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4030,15 +4012,15 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB86_1; ; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4046,9 +4028,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, 
[release_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4059,9 +4041,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4077,15 +4059,15 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB87_1; ; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4093,9 +4075,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; 
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4106,9 +4088,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4124,15 +4106,15 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB88_1; ; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4140,9 +4122,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; 
SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4153,9 +4135,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4171,15 +4153,15 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB89_1; ; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_sys( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: 
.reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4187,9 +4169,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4200,15 +4182,15 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB90_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4218,15 +4200,15 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB90_1; ; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr 
%addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_cta( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4234,9 +4216,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4247,15 +4229,15 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 
bra $L__BB91_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4265,15 +4247,15 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB91_1; ; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4281,9 +4263,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4294,15 +4276,15 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: 
ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB92_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4312,15 +4294,15 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB92_1; ; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_sys( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4328,8 +4310,8 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4341,15 +4323,15 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB93_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4362,12 +4344,12 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_cta( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4375,8 +4357,8 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, 
[release_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4388,15 +4370,15 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB94_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4409,12 +4391,12 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_gpu( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4422,8 +4404,8 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4435,15 +4417,15 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB95_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4456,12 +4438,12 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
acq_rel_acquire_i8_shared_sys( +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4469,9 +4451,9 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4482,15 +4464,15 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB96_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4503,12 +4485,12 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_cta( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4516,9 +4498,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4529,15 +4511,15 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; 
SM70-NEXT: @%p1 bra $L__BB97_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4547,15 +4529,15 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB97_1; ; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4563,9 +4545,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4576,15 +4558,15 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB98_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4594,15 +4576,15 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB98_1; ; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4610,9 +4592,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, 
%rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4623,7 +4605,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -4631,7 +4613,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB99_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4641,15 +4623,15 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB99_1; ; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4657,9 +4639,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4670,15 +4652,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB100_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4688,15 +4670,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB100_1; ; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
acq_rel_seq_cst_i8_generic_gpu( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4704,9 +4686,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4717,15 +4699,15 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB101_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4735,15 +4717,15 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB101_1; ; SM70-NEXT: $L__BB101_3: // 
%partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4751,9 +4733,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4764,7 +4746,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -4772,7 +4754,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 
%r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB102_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4782,15 +4764,15 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB102_1; ; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4798,9 +4780,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4811,7 +4793,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -4819,7 +4801,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB103_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4829,15 +4811,15 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB103_1; ; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4845,9 +4827,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4858,15 +4840,15 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB104_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4876,15 +4858,15 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB104_1; ; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( +define i8 
@release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4892,8 +4874,8 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4905,7 +4887,7 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -4926,12 +4908,12 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4939,8 +4921,8 @@ 
define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4952,7 +4934,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -4973,12 +4955,12 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4986,8 +4968,8 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4999,7 +4981,7 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -5020,12 +5002,12 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5033,9 +5015,9 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5046,7 +5028,7 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -5067,12 +5049,12 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5080,9 +5062,9 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5093,7 
+5075,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -5101,7 +5083,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB109_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5111,15 +5093,15 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB109_1; ; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5127,9 +5109,9 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5140,7 +5122,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -5148,7 +5130,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB110_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5158,15 +5140,15 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB110_1; ; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 
%cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_sys( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5174,9 +5156,9 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5187,15 +5169,15 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB111_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5205,15 +5187,15 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, 
%r8; ; SM70-NEXT: @%p2 bra $L__BB111_1; ; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_cta( +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5221,9 +5203,9 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5234,7 +5216,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -5242,7 +5224,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB112_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5252,15 +5234,15 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB112_1; ; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5268,9 +5250,9 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5281,7 +5263,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 
% ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -5289,7 +5271,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB113_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5299,15 +5281,15 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB113_1; ; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_sys( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5315,9 +5297,9 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[seq_cst_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5328,15 +5310,15 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB114_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5346,15 +5328,15 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB114_1; ; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 
@seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5362,9 +5344,9 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5375,15 +5357,15 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB115_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5393,15 +5375,15 @@ define i8 
@seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB115_1; ; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5409,9 +5391,9 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5422,7 +5404,7 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; @@ -5430,7 +5412,7 
@@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB116_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5440,15 +5422,15 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB116_1; ; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5456,9 +5438,9 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5469,15 
+5451,15 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB117_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5490,12 +5472,12 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_cta( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5503,9 +5485,9 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; 
SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5516,15 +5498,15 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB118_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5537,12 +5519,12 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5550,9 +5532,9 @@ define i8 
@seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5563,15 +5545,15 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB119_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5584,12 +5566,12 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_global_sys(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_sys( +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5597,9 +5579,9 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5610,15 +5592,15 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB120_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5631,12 +5613,12 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_cta( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5644,9 +5626,9 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5657,15 +5639,15 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], 
%r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB121_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5675,15 +5657,15 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB121_1; ; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_gpu( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5691,9 +5673,9 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5704,15 +5686,15 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, 
[acq_rel_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB122_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5722,15 +5704,15 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB122_1; ; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_sys( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5738,9 +5720,9 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5751,15 +5733,15 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB123_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5769,15 +5751,15 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB123_1; ; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_cta( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg 
.b16 %rs<2>; @@ -5785,9 +5767,9 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5798,15 +5780,15 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB124_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5816,15 +5798,15 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB124_1; ; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) 
%addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5832,9 +5814,9 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5845,15 +5827,15 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, 
%r18; ; SM70-NEXT: @%p1 bra $L__BB125_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5863,15 +5845,15 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB125_1; ; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5879,9 +5861,9 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5892,15 +5874,15 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; 
-; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB126_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5910,15 +5892,15 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB126_1; ; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5926,9 +5908,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 
%r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5939,15 +5921,15 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB127_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5957,15 +5939,15 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB127_1; ; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5973,9 +5955,9 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5986,15 +5968,15 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB128_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6004,15 +5986,15 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB128_1; ; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_sys(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6020,9 +6002,9 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6033,15 +6015,15 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB129_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6054,12 +6036,12 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; 
SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6067,9 +6049,9 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6080,15 +6062,15 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB130_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6101,12 +6083,12 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6114,9 +6096,9 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6127,15 +6109,15 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; 
; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB131_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6148,12 +6130,12 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6161,8 +6143,8 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6174,15 +6156,15 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne 
; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB132_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6195,12 +6177,12 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6208,9 +6190,9 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6221,15 +6203,15 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB133_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6239,15 +6221,15 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB133_1; ; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 
%rs<2>; @@ -6255,9 +6237,9 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6268,15 +6250,15 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB134_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6286,51 +6268,2210 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB134_1; ; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 
%cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic_sys( +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<21>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB135_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB135_1; +; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; 
SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB136_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB136_1; +; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB137_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB137_1; +; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB138_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB138_1; +; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; 
SM70-NEXT: @%p1 bra $L__BB139_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB139_1; +; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB140_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB140_1 
Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB140_1; +; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB141_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; 
SM70-NEXT: @%p2 bra $L__BB141_1; +; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB142_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB142_1; +; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB143_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB143_1; +; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg 
ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB144_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB144_1; +; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) 
{ +; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB145_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB145_1; +; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; 
SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB146_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB146_1; +; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 
%rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB147_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB147_1; +; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; 
SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB148_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB148_1; +; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: 
and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB149_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB149_1; +; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; 
SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB150_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB150_1; +; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 
%r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB151_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB151_1; +; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: 
ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB152_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB152_1; +; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: 
ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB153_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB153_1; +; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB154_1: // 
%partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB154_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB154_1; +; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 
%r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB155_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB155_1; +; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB156_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB156_1; +; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB157_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB157_1 
Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB157_1; +; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB158_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra 
$L__BB158_1; +; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB159_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB159_1; +; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], 
%r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB160_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB160_1; +; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 
@seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB161_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB161_1; +; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
seq_cst_acquire_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB162_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB162_1; +; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; 
+; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB163_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB163_1; +; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB164_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB164_1; +; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB165_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB165_1; +; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB166_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB166_1; +; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; 
+; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB167_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB167_1; +; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 
%r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB168_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB168_1; +; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; 
SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB169_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB169_1; +; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB170_1: // 
%partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB170_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB170_1; +; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, 
%r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB171_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB171_1; +; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra 
$L__BB172_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB172_1; +; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB173_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 +; 
SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB173_1; +; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB174_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; 
SM70-NEXT: @%p2 bra $L__BB174_1; +; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB175_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB175_1; +; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB176_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB176_1; +; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) 
%addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB177_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB177_1; +; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB178_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB178_1; +; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM70: { +; 
SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB179_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB179_1; +; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg 
.b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB180_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB135_1; -; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB180_1; +; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_sys( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB181_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB181_1; +; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -6361,20 +8502,20 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB136_3; +; SM70-NEXT: @%p1 bra $L__BB182_3; ; SM70-NEXT: // 
%bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB136_1; -; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB182_1; +; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -6405,26 +8546,70 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB137_3; +; SM70-NEXT: @%p1 bra $L__BB183_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB137_1; -; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB183_1; +; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg 
.b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB184_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB184_1; +; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: monotonic_monotonic_i16_global_sys( ; SM70: { @@ -6449,20 +8634,20 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB138_1: // 
%partword.cmpxchg.loop +; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB138_3; +; SM70-NEXT: @%p1 bra $L__BB185_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB138_1; -; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB185_1; +; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -6493,20 +8678,20 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB139_3; +; SM70-NEXT: @%p1 bra $L__BB186_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB139_1; -; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB186_1; +; 
SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -6537,26 +8722,70 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB140_3; +; SM70-NEXT: @%p1 bra $L__BB187_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB140_1; -; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB187_1; +; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 
%r9, [monotonic_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB188_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB188_1; +; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: monotonic_monotonic_i16_shared_sys( ; SM70: { @@ -6581,20 +8810,20 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra 
$L__BB141_3; +; SM70-NEXT: @%p1 bra $L__BB189_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB141_1; -; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB189_1; +; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -6613,7 +8842,51 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB190_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB190_1; +; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6625,28 +8898,28 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB142_3; +; SM70-NEXT: @%p1 bra $L__BB191_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 
%r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB142_1; -; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB191_1; +; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6654,10 +8927,10 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6667,25 +8940,26 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB192_1: 
// %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB143_3; +; SM70-NEXT: @%p1 bra $L__BB192_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB143_1; -; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB192_1; +; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire ret i16 %new } @@ -6713,20 +8987,20 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB144_3; +; SM70-NEXT: @%p1 bra $L__BB193_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; 
SM70-NEXT: @%p2 bra $L__BB144_1; -; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB193_1; +; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -6758,20 +9032,20 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB145_3; +; SM70-NEXT: @%p1 bra $L__BB194_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB145_1; -; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB194_1; +; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -6803,20 +9077,20 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra 
$L__BB146_3; +; SM70-NEXT: @%p1 bra $L__BB195_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB146_1; -; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB195_1; +; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -6824,6 +9098,51 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra 
$L__BB196_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB196_1; +; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: monotonic_acquire_i16_global_sys( ; SM70: { @@ -6848,20 +9167,20 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB147_3; +; SM70-NEXT: @%p1 bra $L__BB197_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB147_1; -; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB197_1; +; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -6893,20 +9212,20 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB148_3; +; SM70-NEXT: @%p1 bra $L__BB198_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB148_1; -; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB198_1; +; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -6938,20 +9257,20 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB149_3; +; SM70-NEXT: @%p1 bra $L__BB199_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB149_1; -; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; 
SM70-NEXT: @%p2 bra $L__BB199_1; +; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -6959,6 +9278,51 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB200_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB200_1; +; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; 
SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: monotonic_acquire_i16_shared_sys( ; SM70: { @@ -6983,20 +9347,20 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB150_3; +; SM70-NEXT: @%p1 bra $L__BB201_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB150_1; -; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB201_1; +; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7028,20 +9392,20 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM70-NEXT: @%p1 bra $L__BB151_3; +; SM70-NEXT: @%p1 bra $L__BB202_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB151_1; -; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB202_1; +; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7073,20 +9437,20 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB152_3; +; SM70-NEXT: @%p1 bra $L__BB203_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB152_1; -; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB203_1; +; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7094,6 +9458,52 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: 
monotonic_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB204_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB204_1; +; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: monotonic_seq_cst_i16_generic_sys( ; SM70: { @@ -7119,20 +9529,20 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB153_3; +; SM70-NEXT: @%p1 bra $L__BB205_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB153_1; -; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB205_1; +; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7165,20 +9575,20 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB154_3; +; SM70-NEXT: @%p1 bra $L__BB206_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB154_1; -; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra 
$L__BB206_1; +; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7211,20 +9621,20 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB155_3; +; SM70-NEXT: @%p1 bra $L__BB207_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB155_1; -; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB207_1; +; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7232,6 +9642,52 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB208_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB208_1; +; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: monotonic_seq_cst_i16_global_sys( ; SM70: { @@ -7257,20 +9713,20 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM70-NEXT: @%p1 bra $L__BB156_3; +; SM70-NEXT: @%p1 bra $L__BB209_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB156_1; -; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB209_1; +; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7303,20 +9759,20 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB157_3; +; SM70-NEXT: @%p1 bra $L__BB210_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB157_1; -; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB210_1; +; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7349,20 +9805,20 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; 
SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB158_3; +; SM70-NEXT: @%p1 bra $L__BB211_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB158_1; -; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB211_1; +; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7370,6 +9826,52 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; 
SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB212_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB212_1; +; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: monotonic_seq_cst_i16_shared_sys( ; SM70: { @@ -7395,20 +9897,20 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB159_3; +; SM70-NEXT: @%p1 bra $L__BB213_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB159_1; -; 
SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB213_1; +; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7441,20 +9943,20 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB160_3; +; SM70-NEXT: @%p1 bra $L__BB214_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB160_1; -; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB214_1; +; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7487,20 +9989,20 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB161_3; 
+; SM70-NEXT: @%p1 bra $L__BB215_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB161_1; -; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB215_1; +; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7508,6 +10010,51 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB216_3; +; SM70-NEXT: // %bb.2: 
// %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB216_1; +; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acquire_monotonic_i16_generic_sys( ; SM70: { @@ -7532,20 +10079,20 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB162_3; +; SM70-NEXT: @%p1 bra $L__BB217_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB162_1; -; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB217_1; +; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7562,10 +10109,55 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[acquire_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB218_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB218_1; +; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_gpu( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, 
[acquire_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7577,29 +10169,29 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB163_3; +; SM70-NEXT: @%p1 bra $L__BB219_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB163_1; -; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: @%p2 bra $L__BB219_1; +; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic_gpu( 
+define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -7607,10 +10199,10 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7620,26 +10212,26 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB164_3; +; SM70-NEXT: @%p1 bra $L__BB220_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, 
%r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB164_1; -; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: @%p2 bra $L__BB220_1; +; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic ret i16 %new } @@ -7667,20 +10259,20 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB165_3; +; SM70-NEXT: @%p1 bra $L__BB221_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB165_1; -; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB221_1; +; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7712,20 +10304,20 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM70-NEXT: 
$L__BB222_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB166_3; +; SM70-NEXT: @%p1 bra $L__BB222_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB166_1; -; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB222_1; +; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7757,20 +10349,20 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB167_3; +; SM70-NEXT: @%p1 bra $L__BB223_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB167_1; -; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB223_1; +; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7778,6 +10370,51 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB224_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB224_1; +; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + define i16 
@acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acquire_monotonic_i16_shared_sys( ; SM70: { @@ -7802,20 +10439,20 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB168_3; +; SM70-NEXT: @%p1 bra $L__BB225_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB168_1; -; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB225_1; +; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7847,20 +10484,20 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB169_3; +; SM70-NEXT: @%p1 bra $L__BB226_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB169_1; -; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB226_1; +; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7892,20 +10529,20 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB170_3; +; SM70-NEXT: @%p1 bra $L__BB227_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB170_1; -; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB227_1; +; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7913,6 +10550,51 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; 
SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB228_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB228_1; +; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acquire_acquire_i16_generic_sys( ; SM70: { @@ -7937,20 +10619,20 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB171_3; +; SM70-NEXT: @%p1 bra $L__BB229_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB171_1; -; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB229_1; +; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -7982,20 +10664,20 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB172_3; +; SM70-NEXT: @%p1 bra $L__BB230_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB172_1; -; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB230_1; +; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8027,20 
+10709,20 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB173_3; +; SM70-NEXT: @%p1 bra $L__BB231_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB173_1; -; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB231_1; +; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8048,6 +10730,51 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; 
SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB232_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB232_1; +; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acquire_acquire_i16_global_sys( ; SM70: { @@ -8072,20 +10799,20 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB174_3; +; SM70-NEXT: @%p1 bra $L__BB233_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM70-NEXT: 
and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB174_1; -; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB233_1; +; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8117,20 +10844,20 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB175_3; +; SM70-NEXT: @%p1 bra $L__BB234_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB175_1; -; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB234_1; +; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8162,20 +10889,20 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; 
SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB176_3; +; SM70-NEXT: @%p1 bra $L__BB235_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB176_1; -; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB235_1; +; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8183,6 +10910,51 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: 
atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB236_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB236_1; +; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acquire_acquire_i16_shared_sys( ; SM70: { @@ -8207,20 +10979,20 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB177_3; +; SM70-NEXT: @%p1 bra $L__BB237_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB177_1; -; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB237_1; +; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8252,20 +11024,20 @@ define i16 
@acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB178_3; +; SM70-NEXT: @%p1 bra $L__BB238_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB178_1; -; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB238_1; +; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8297,20 +11069,20 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB179_3; +; SM70-NEXT: @%p1 bra $L__BB239_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; 
SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB179_1; -; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB239_1; +; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8318,6 +11090,52 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB240_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB240_1; +; SM70-NEXT: 
$L__BB240_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acquire_seq_cst_i16_generic_sys( ; SM70: { @@ -8343,20 +11161,20 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB180_3; +; SM70-NEXT: @%p1 bra $L__BB241_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB180_1; -; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB241_1; +; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8389,20 +11207,20 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], 
%r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB181_3; +; SM70-NEXT: @%p1 bra $L__BB242_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB181_1; -; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB242_1; +; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8435,20 +11253,20 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB182_3; +; SM70-NEXT: @%p1 bra $L__BB243_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB182_1; -; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB243_1; +; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8456,6 +11274,52 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, 
i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB244_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB244_1; +; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acquire_seq_cst_i16_global_sys( ; SM70: { @@ -8481,29 +11345,75 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 
%cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB183_3; +; SM70-NEXT: @%p1 bra $L__BB245_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB183_1; -; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB245_1; +; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; 
SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB246_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB246_1; +; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_cta( +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -8511,10 +11421,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; 
SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8527,29 +11437,29 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB184_3; +; SM70-NEXT: @%p1 bra $L__BB247_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB184_1; -; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: @%p2 bra $L__BB247_1; +; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_gpu( +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; 
SM70-LABEL: acquire_seq_cst_i16_shared( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -8557,10 +11467,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8571,26 +11481,26 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB185_3; +; SM70-NEXT: @%p1 bra $L__BB248_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra 
$L__BB185_1; -; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: @%p2 bra $L__BB248_1; +; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst ret i16 %new } @@ -8619,20 +11529,20 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB186_3; +; SM70-NEXT: @%p1 bra $L__BB249_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB186_1; -; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB249_1; +; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8665,20 +11575,20 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB187_3; +; SM70-NEXT: @%p1 bra $L__BB250_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB187_1; -; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB250_1; +; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8711,20 +11621,20 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB188_3; +; SM70-NEXT: @%p1 bra $L__BB251_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB188_1; -; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB251_1; +; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -8732,6 
+11642,51 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB252_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB252_1; +; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: 
release_monotonic_i16_generic_sys( ; SM70: { @@ -8757,20 +11712,20 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB189_3; +; SM70-NEXT: @%p1 bra $L__BB253_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB189_1; -; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB253_1; +; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -8802,20 +11757,20 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB190_3; +; SM70-NEXT: @%p1 bra $L__BB254_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM70-NEXT: // in Loop: 
Header=BB254_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB190_1; -; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB254_1; +; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -8847,26 +11802,71 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB191_3; +; SM70-NEXT: @%p1 bra $L__BB255_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB191_1; -; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB255_1; +; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB256_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB256_1; +; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: release_monotonic_i16_global_sys( ; SM70: { @@ -8892,20 +11892,20 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB192_3; +; SM70-NEXT: @%p1 bra $L__BB257_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB192_1; -; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB257_1; +; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -8937,20 +11937,20 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB193_3; +; SM70-NEXT: @%p1 bra $L__BB258_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB193_1; -; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB258_1; +; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -8980,25 +11980,70 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB259_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB259_1; +; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: 
mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB194_3; +; SM70-NEXT: @%p1 bra $L__BB260_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB194_1; -; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB260_1; +; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic ret i16 %new } @@ -9027,20 +12072,20 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB195_3; +; SM70-NEXT: @%p1 bra $L__BB261_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB195_1; -; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB261_1; +; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -9072,20 +12117,20 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB196_3; +; SM70-NEXT: @%p1 bra $L__BB262_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB196_1; -; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB262_1; +; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -9117,26 +12162,72 @@ define i16 
@release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB197_3; +; SM70-NEXT: @%p1 bra $L__BB263_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB197_1; -; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB263_1; +; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: 
not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB264_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB264_1; +; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: release_acquire_i16_generic_sys( ; SM70: { @@ -9162,20 +12253,20 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB198_3; +; SM70-NEXT: @%p1 bra $L__BB265_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; 
SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB198_1; -; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB265_1; +; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9208,20 +12299,20 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB199_3; +; SM70-NEXT: @%p1 bra $L__BB266_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB199_1; -; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB266_1; +; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9254,20 +12345,20 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM70-NEXT: @%p1 bra $L__BB200_3; +; SM70-NEXT: @%p1 bra $L__BB267_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB200_1; -; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB267_1; +; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9275,6 +12366,52 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 
%p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB268_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB268_1; +; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: release_acquire_i16_global_sys( ; SM70: { @@ -9300,20 +12437,20 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB201_3; +; SM70-NEXT: @%p1 bra $L__BB269_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB201_1; -; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB269_1; +; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9346,20 +12483,20 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, 
%r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB270_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB202_3; +; SM70-NEXT: @%p1 bra $L__BB270_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB270_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB202_1; -; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB270_1; +; SM70-NEXT: $L__BB270_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9392,20 +12529,20 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB271_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB203_3; +; SM70-NEXT: @%p1 bra $L__BB271_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB203_1; -; SM70-NEXT: $L__BB203_3: // 
%partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB271_1; +; SM70-NEXT: $L__BB271_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9413,6 +12550,52 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB272_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB272_1; +; SM70-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: release_acquire_i16_shared_sys( ; SM70: { @@ -9438,20 +12621,20 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB273_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB204_3; +; SM70-NEXT: @%p1 bra $L__BB273_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB273_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB204_1; -; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB273_1; +; SM70-NEXT: $L__BB273_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9484,20 +12667,20 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB274_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB205_3; +; SM70-NEXT: @%p1 bra $L__BB274_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB274_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB205_1; -; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB274_1; +; SM70-NEXT: $L__BB274_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9530,24 +12713,70 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB275_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB275_1; +; SM70-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; 
SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB276_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB206_3; +; SM70-NEXT: @%p1 bra $L__BB276_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB206_1; -; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: @%p2 bra $L__BB276_1; +; SM70-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst ret i16 %new } @@ -9576,20 +12805,20 @@ define i16 
@release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB277_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB207_3; +; SM70-NEXT: @%p1 bra $L__BB277_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB207_1; -; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB277_1; +; SM70-NEXT: $L__BB277_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9622,20 +12851,20 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB278_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB208_3; +; SM70-NEXT: @%p1 bra $L__BB278_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; 
SM70-NEXT: @%p2 bra $L__BB208_1; -; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB278_1; +; SM70-NEXT: $L__BB278_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9668,20 +12897,20 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB279_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB209_3; +; SM70-NEXT: @%p1 bra $L__BB279_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB209_1; -; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB279_1; +; SM70-NEXT: $L__BB279_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9689,6 +12918,52 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM70-NEXT: 
fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB280_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB280_1; +; SM70-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: release_seq_cst_i16_global_sys( ; SM70: { @@ -9714,20 +12989,20 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB281_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB210_3; +; SM70-NEXT: @%p1 bra $L__BB281_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB210_1; -; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB281_1; +; SM70-NEXT: $L__BB281_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9760,20 +13035,20 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB282_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB211_3; +; SM70-NEXT: @%p1 bra $L__BB282_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB211_1; -; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB282_1; +; SM70-NEXT: $L__BB282_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9806,20 +13081,20 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 
%r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB283_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB212_3; +; SM70-NEXT: @%p1 bra $L__BB283_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB212_1; -; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB283_1; +; SM70-NEXT: $L__BB283_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9827,6 +13102,52 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, 
%rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB284_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB284_1; +; SM70-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: release_seq_cst_i16_shared_sys( ; SM70: { @@ -9852,20 +13173,20 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB285_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB213_3; +; SM70-NEXT: @%p1 bra $L__BB285_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB285_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; 
; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB213_1; -; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB285_1; +; SM70-NEXT: $L__BB285_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9898,20 +13219,20 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB286_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB214_3; +; SM70-NEXT: @%p1 bra $L__BB286_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB214_1; -; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB286_1; +; SM70-NEXT: $L__BB286_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9944,20 +13265,20 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB287_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB215_3; +; SM70-NEXT: @%p1 bra $L__BB287_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB215_1; -; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB287_1; +; SM70-NEXT: $L__BB287_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -9965,6 +13286,52 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, 
%r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB288_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB288_1; +; SM70-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acq_rel_monotonic_i16_generic_sys( ; SM70: { @@ -9990,20 +13357,20 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB289_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB216_3; +; SM70-NEXT: @%p1 bra $L__BB289_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB289_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB216_1; -; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB289_1; +; SM70-NEXT: $L__BB289_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10036,20 +13403,20 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, 
%r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB290_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB217_3; +; SM70-NEXT: @%p1 bra $L__BB290_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB217_1; -; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB290_1; +; SM70-NEXT: $L__BB290_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10082,20 +13449,20 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB291_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB218_3; +; SM70-NEXT: @%p1 bra $L__BB291_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB218_1; -; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; 
SM70-NEXT: @%p2 bra $L__BB291_1; +; SM70-NEXT: $L__BB291_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10103,6 +13470,52 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB292_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB292_1; +; SM70-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acq_rel_monotonic_i16_global_sys( ; SM70: { @@ -10128,20 +13541,20 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB293_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB219_3; +; SM70-NEXT: @%p1 bra $L__BB293_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB219_1; -; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB293_1; +; SM70-NEXT: $L__BB293_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10174,20 +13587,20 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB294_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB220_3; +; SM70-NEXT: @%p1 bra $L__BB294_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB294_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB220_1; -; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB294_1; +; SM70-NEXT: $L__BB294_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10220,24 +13633,70 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB295_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB295_1; +; SM70-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg 
.b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB296_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB221_3; +; SM70-NEXT: @%p1 bra $L__BB296_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB221_1; -; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: @%p2 bra $L__BB296_1; +; SM70-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic ret i16 %new } @@ -10266,20 
+13725,20 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB297_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB222_3; +; SM70-NEXT: @%p1 bra $L__BB297_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB222_1; -; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB297_1; +; SM70-NEXT: $L__BB297_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10312,20 +13771,20 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB298_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB223_3; +; SM70-NEXT: @%p1 bra $L__BB298_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB223_1; -; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB298_1; +; SM70-NEXT: $L__BB298_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10358,20 +13817,20 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB299_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB224_3; +; SM70-NEXT: @%p1 bra $L__BB299_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB299_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB224_1; -; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB299_1; +; SM70-NEXT: $L__BB299_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10379,6 +13838,52 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; 
SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB300_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB300_1; +; SM70-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acq_rel_acquire_i16_generic_sys( ; SM70: { @@ -10404,20 +13909,20 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB301_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, 
%r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB225_3; +; SM70-NEXT: @%p1 bra $L__BB301_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB225_1; -; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB301_1; +; SM70-NEXT: $L__BB301_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10450,20 +13955,20 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB302_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB226_3; +; SM70-NEXT: @%p1 bra $L__BB302_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB226_1; -; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB302_1; +; SM70-NEXT: $L__BB302_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10496,20 +14001,20 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, 
%r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB303_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB227_3; +; SM70-NEXT: @%p1 bra $L__BB303_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB227_1; -; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB303_1; +; SM70-NEXT: $L__BB303_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10517,6 +14022,52 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB304_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB304_1; +; SM70-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acq_rel_acquire_i16_global_sys( ; SM70: { @@ -10542,20 +14093,20 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB305_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB228_3; +; SM70-NEXT: @%p1 bra $L__BB305_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: 
mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB228_1; -; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB305_1; +; SM70-NEXT: $L__BB305_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10588,20 +14139,20 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB306_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB229_3; +; SM70-NEXT: @%p1 bra $L__BB306_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB306_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB229_1; -; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB306_1; +; SM70-NEXT: $L__BB306_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10634,20 +14185,20 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB307_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB230_3; +; SM70-NEXT: @%p1 bra $L__BB307_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB230_1; -; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB307_1; +; SM70-NEXT: $L__BB307_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10655,6 +14206,52 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], 
%r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB308_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB308_1; +; SM70-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acq_rel_acquire_i16_shared_sys( ; SM70: { @@ -10680,20 +14277,20 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB309_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB231_3; +; SM70-NEXT: @%p1 bra $L__BB309_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB231_1; -; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB309_1; +; SM70-NEXT: $L__BB309_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10726,20 +14323,20 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, 
i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB310_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB232_3; +; SM70-NEXT: @%p1 bra $L__BB310_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB310_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB232_1; -; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB310_1; +; SM70-NEXT: $L__BB310_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10772,20 +14369,20 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB311_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB233_3; +; SM70-NEXT: @%p1 bra $L__BB311_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra 
$L__BB233_1; -; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB311_1; +; SM70-NEXT: $L__BB311_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10793,6 +14390,52 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB312_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB312_1; +; SM70-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys( ; SM70: { @@ -10818,20 +14461,20 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB313_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB234_3; +; SM70-NEXT: @%p1 bra $L__BB313_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB234_1; -; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB313_1; +; SM70-NEXT: $L__BB313_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10864,20 +14507,20 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB314_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, 
%r17; -; SM70-NEXT: @%p1 bra $L__BB235_3; +; SM70-NEXT: @%p1 bra $L__BB314_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB314_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB235_1; -; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB314_1; +; SM70-NEXT: $L__BB314_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -10908,26 +14551,72 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB315_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB315_1; +; SM70-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 
%r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB316_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB236_3; +; SM70-NEXT: @%p1 bra $L__BB316_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB236_1; -; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: @%p2 bra $L__BB316_1; +; SM70-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst ret i16 %new } @@ 
-10956,20 +14645,20 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB317_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB237_3; +; SM70-NEXT: @%p1 bra $L__BB317_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB237_1; -; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB317_1; +; SM70-NEXT: $L__BB317_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11002,20 +14691,20 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB318_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB238_3; +; SM70-NEXT: @%p1 bra $L__BB318_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB238_1; -; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB318_1; +; SM70-NEXT: $L__BB318_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11048,20 +14737,20 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB319_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB239_3; +; SM70-NEXT: @%p1 bra $L__BB319_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB319_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB239_1; -; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB319_1; +; SM70-NEXT: $L__BB319_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11069,6 +14758,52 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, 
[acq_rel_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB320_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB320_1; +; SM70-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys( ; SM70: { @@ -11094,20 +14829,20 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB321_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB240_3; +; SM70-NEXT: @%p1 bra $L__BB321_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB240_1; -; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB321_1; +; SM70-NEXT: $L__BB321_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11140,20 +14875,20 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB322_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB241_3; +; SM70-NEXT: @%p1 bra $L__BB322_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB322_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB241_1; -; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB322_1; +; SM70-NEXT: $L__BB322_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11186,20 +14921,20 @@ 
define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB323_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB242_3; +; SM70-NEXT: @%p1 bra $L__BB323_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB242_1; -; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB323_1; +; SM70-NEXT: $L__BB323_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11207,6 +14942,52 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB324_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB324_1; +; SM70-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_monotonic_i16_generic_sys( ; SM70: { @@ -11232,20 +15013,20 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB325_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB243_3; +; SM70-NEXT: @%p1 bra $L__BB325_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB325_1 Depth=1 ; SM70-NEXT: and.b32 
%r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB243_1; -; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB325_1; +; SM70-NEXT: $L__BB325_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11278,20 +15059,20 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB326_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB244_3; +; SM70-NEXT: @%p1 bra $L__BB326_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB326_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB244_1; -; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB326_1; +; SM70-NEXT: $L__BB326_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11324,20 +15105,20 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB327_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, 
[%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB245_3; +; SM70-NEXT: @%p1 bra $L__BB327_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB245_1; -; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB327_1; +; SM70-NEXT: $L__BB327_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11345,6 +15126,52 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB328_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB328_1; +; SM70-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_monotonic_i16_global_sys( ; SM70: { @@ -11370,20 +15197,20 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB329_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB246_3; +; SM70-NEXT: @%p1 bra $L__BB329_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB246_1; -; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB329_1; +; SM70-NEXT: $L__BB329_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11416,20 +15243,20 @@ define i16 
@seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB330_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB247_3; +; SM70-NEXT: @%p1 bra $L__BB330_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB330_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB247_1; -; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB330_1; +; SM70-NEXT: $L__BB330_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11462,20 +15289,20 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB331_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB248_3; +; SM70-NEXT: @%p1 bra $L__BB331_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; 
SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB248_1; -; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB331_1; +; SM70-NEXT: $L__BB331_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11483,6 +15310,52 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB332_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra 
$L__BB332_1; +; SM70-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_monotonic_i16_shared_sys( ; SM70: { @@ -11508,20 +15381,20 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB333_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB249_3; +; SM70-NEXT: @%p1 bra $L__BB333_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB333_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB249_1; -; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB333_1; +; SM70-NEXT: $L__BB333_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11554,20 +15427,20 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB334_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; 
; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB250_3; +; SM70-NEXT: @%p1 bra $L__BB334_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB250_1; -; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB334_1; +; SM70-NEXT: $L__BB334_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11598,26 +15471,72 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB335_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB335_1; +; SM70-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic(ptr %addr, 
i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB336_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB251_3; +; SM70-NEXT: @%p1 bra $L__BB336_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB251_1; -; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: @%p2 bra $L__BB336_1; +; SM70-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire ret i16 %new } @@ -11646,20 +15565,20 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB337_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB252_3; +; SM70-NEXT: @%p1 bra $L__BB337_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB337_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB252_1; -; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB337_1; +; SM70-NEXT: $L__BB337_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11692,20 +15611,20 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB338_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB253_3; +; SM70-NEXT: @%p1 bra $L__BB338_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in 
Loop: Header=BB253_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB253_1; -; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB338_1; +; SM70-NEXT: $L__BB338_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11738,20 +15657,20 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB339_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB254_3; +; SM70-NEXT: @%p1 bra $L__BB339_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB254_1; -; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB339_1; +; SM70-NEXT: $L__BB339_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11759,6 +15678,52 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; 
SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB340_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB340_1; +; SM70-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_acquire_i16_global_sys( ; SM70: { @@ -11784,20 +15749,20 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM70-NEXT: 
$L__BB341_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB255_3; +; SM70-NEXT: @%p1 bra $L__BB341_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB255_1; -; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB341_1; +; SM70-NEXT: $L__BB341_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11830,20 +15795,20 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB342_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB256_3; +; SM70-NEXT: @%p1 bra $L__BB342_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB342_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB256_1; -; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB342_1; +; SM70-NEXT: $L__BB342_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11876,20 +15841,20 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB343_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB257_3; +; SM70-NEXT: @%p1 bra $L__BB343_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB257_1; -; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB343_1; +; SM70-NEXT: $L__BB343_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11897,6 +15862,52 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 
3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB344_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB344_1; +; SM70-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_acquire_i16_shared_sys( ; SM70: { @@ -11922,20 +15933,20 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB345_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB258_3; +; SM70-NEXT: @%p1 bra $L__BB345_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB345_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB258_1; -; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB345_1; +; SM70-NEXT: $L__BB345_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -11968,20 +15979,20 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB346_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB259_3; +; SM70-NEXT: @%p1 bra $L__BB346_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB259_1; -; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB346_1; +; SM70-NEXT: $L__BB346_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12014,20 +16025,20 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB347_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB260_3; +; SM70-NEXT: @%p1 bra $L__BB347_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB260_1; -; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB347_1; +; SM70-NEXT: $L__BB347_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12035,6 +16046,52 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_generic( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB348_1: // 
%partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB348_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB348_1; +; SM70-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys( ; SM70: { @@ -12060,20 +16117,20 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB349_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB261_3; +; SM70-NEXT: @%p1 bra $L__BB349_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB349_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB261_1; -; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB349_1; +; SM70-NEXT: $L__BB349_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; 
; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12106,20 +16163,20 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB350_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB262_3; +; SM70-NEXT: @%p1 bra $L__BB350_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB262_1; -; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB350_1; +; SM70-NEXT: $L__BB350_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12152,20 +16209,20 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB351_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB263_3; +; SM70-NEXT: @%p1 bra $L__BB351_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM70-NEXT: 
and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB263_1; -; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB351_1; +; SM70-NEXT: $L__BB351_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12173,6 +16230,52 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB352_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB352_1; +; SM70-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_seq_cst_i16_global_sys( ; SM70: { @@ -12198,20 +16301,20 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB353_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB264_3; +; SM70-NEXT: @%p1 bra $L__BB353_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB264_1; -; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB353_1; +; SM70-NEXT: $L__BB353_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12244,20 +16347,20 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB354_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB265_3; +; SM70-NEXT: @%p1 bra $L__BB354_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB265_1; -; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB354_1; +; SM70-NEXT: $L__BB354_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12288,26 +16391,72 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB355_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB355_1; +; SM70-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + 
+define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_shared( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB356_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB266_3; +; SM70-NEXT: @%p1 bra $L__BB356_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB356_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB266_1; -; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: @%p2 bra $L__BB356_1; +; SM70-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } @@ -12336,20 +16485,20 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB357_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB267_3; +; SM70-NEXT: @%p1 bra $L__BB357_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB357_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB267_1; -; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB357_1; +; SM70-NEXT: $L__BB357_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12382,20 +16531,20 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB358_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 
bra $L__BB268_3; +; SM70-NEXT: @%p1 bra $L__BB358_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB268_1; -; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB358_1; +; SM70-NEXT: $L__BB358_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12428,20 +16577,20 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB359_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB269_3; +; SM70-NEXT: @%p1 bra $L__BB359_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB269_1; -; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM70-NEXT: @%p2 bra $L__BB359_1; +; SM70-NEXT: $L__BB359_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -12449,6 +16598,23 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
monotonic_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_monotonic_i32_generic_sys( ; SM70: { @@ -12500,6 +16666,23 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_monotonic_i32_global_sys( ; SM70: { @@ -12551,6 +16734,23 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ret i32 %new } +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_shared( +; SM70: { +; 
SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_monotonic_i32_shared_sys( ; SM70: { @@ -12602,6 +16802,23 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ret i32 %new } +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_acquire_i32_generic_sys( ; SM70: { @@ -12653,6 +16870,23 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; 
SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_acquire_i32_global_sys( ; SM70: { @@ -12704,6 +16938,23 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_acquire_i32_shared_sys( ; SM70: { @@ -12755,6 +17006,24 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_seq_cst_i32_generic_sys( ; SM70: { @@ -12809,6 +17078,24 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_seq_cst_i32_global_sys( ; SM70: { @@ -12863,6 +17150,24 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_seq_cst_i32_shared_sys( ; SM70: { @@ -12917,6 +17222,23 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_monotonic_i32_generic_sys( ; SM70: { @@ -12968,6 +17290,23 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 
%r1, [acquire_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_monotonic_i32_global_sys( ; SM70: { @@ -13019,6 +17358,23 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_monotonic_i32_shared_sys( ; SM70: { @@ -13070,6 +17426,23 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 
%r2, [acquire_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_acquire_i32_generic_sys( ; SM70: { @@ -13121,6 +17494,23 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_acquire_i32_global_sys( ; SM70: { @@ -13172,6 +17562,23 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_acquire_i32_shared_sys( ; SM70: { @@ -13223,6 +17630,24 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_seq_cst_i32_generic_sys( ; SM70: { @@ -13277,6 +17702,24 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_seq_cst_i32_global_sys( ; SM70: { @@ -13331,6 +17774,24 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_seq_cst_i32_shared_sys( ; SM70: { @@ -13385,6 +17846,23 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 
%new +} + define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_monotonic_i32_generic_sys( ; SM70: { @@ -13432,7 +17910,24 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic ret i32 %new } @@ -13487,6 +17982,23 @@ define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_monotonic_i32_shared_sys( ; SM70: { @@ -13538,6 +18050,23 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_acquire_i32_generic_sys( ; SM70: { @@ -13589,6 +18118,23 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 
@release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_acquire_i32_global_sys( ; SM70: { @@ -13640,6 +18186,23 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_acquire_i32_shared_sys( ; SM70: { @@ -13691,6 +18254,24 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: 
release_seq_cst_i32_generic_sys( ; SM70: { @@ -13745,6 +18326,24 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_seq_cst_i32_global_sys( ; SM70: { @@ -13799,6 +18398,24 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_seq_cst_i32_shared_sys( ; 
SM70: { @@ -13853,6 +18470,23 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_monotonic_i32_generic_sys( ; SM70: { @@ -13904,6 +18538,23 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_monotonic_i32_global_sys( ; SM70: { @@ -13955,6 +18606,23 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) 
%addr, i32 %cmp, i ret i32 %new } +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_monotonic_i32_shared_sys( ; SM70: { @@ -14006,6 +18674,23 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_acquire_i32_generic_sys( ; SM70: { @@ -14057,6 +18742,23 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 
%new) { +; SM70-LABEL: acq_rel_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_acquire_i32_global_sys( ; SM70: { @@ -14108,6 +18810,23 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_acquire_i32_shared_sys( ; SM70: { @@ -14159,6 +18878,24 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; 
SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_seq_cst_i32_generic_sys( ; SM70: { @@ -14213,6 +18950,24 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_seq_cst_i32_global_sys( ; SM70: { @@ -14267,6 +19022,24 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // 
%bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys( ; SM70: { @@ -14321,6 +19094,24 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_monotonic_i32_generic_sys( ; SM70: { @@ -14375,6 +19166,24 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_monotonic_i32_global_sys( ; SM70: { @@ -14429,6 +19238,24 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_monotonic_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_monotonic_i32_shared_sys( ; SM70: { @@ -14483,6 +19310,24 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[seq_cst_acquire_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_acquire_i32_generic_sys( ; SM70: { @@ -14537,6 +19382,24 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_acquire_i32_global_sys( ; SM70: { @@ -14587,7 +19450,25 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + 
+define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire ret i32 %new } @@ -14645,6 +19526,24 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_generic( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys( ; SM70: { @@ -14699,6 +19598,24 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; 
SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_seq_cst_i32_global_sys( ; SM70: { @@ -14753,6 +19670,24 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_shared( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_seq_cst_i32_shared_sys( ; SM70: { @@ -14807,6 +19742,22 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_monotonic_i64_generic_sys( ; SM70: { @@ -14855,6 +19806,22 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_monotonic_i64_global_sys( ; SM70: { @@ -14903,6 +19870,22 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ret i64 %new } +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_monotonic_i64_shared_sys( ; SM70: { @@ -14951,6 +19934,22 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ret i64 %new } +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_acquire_i64_generic_sys( ; SM70: { @@ -14999,6 +19998,22 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM70-NEXT: 
atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_acquire_i64_global_sys( ; SM70: { @@ -15047,6 +20062,22 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_acquire_i64_shared_sys( ; SM70: { @@ -15095,6 +20126,23 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_seq_cst_i64_generic_sys( ; SM70: { @@ -15146,6 +20194,23 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_seq_cst_i64_global_sys( ; SM70: { @@ -15197,6 +20262,23 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_seq_cst_i64_shared_sys( ; SM70: { @@ -15248,6 +20330,22 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_monotonic_i64_generic_sys( ; SM70: { @@ -15296,6 +20394,22 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, 
i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_monotonic_i64_global_sys( ; SM70: { @@ -15344,6 +20458,22 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_monotonic_i64_shared_sys( ; SM70: { @@ -15392,6 +20522,22 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_acquire_i64_generic_sys( ; SM70: { @@ -15440,6 +20586,22 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, 
i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_acquire_i64_global_sys( ; SM70: { @@ -15488,6 +20650,22 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_acquire_i64_shared_sys( ; SM70: { @@ -15536,6 +20714,23 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: 
acquire_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_seq_cst_i64_generic_sys( ; SM70: { @@ -15587,6 +20782,23 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_seq_cst_i64_global_sys( ; SM70: { @@ -15638,6 +20850,23 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; 
SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_seq_cst_i64_shared_sys( ; SM70: { @@ -15689,6 +20918,22 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_monotonic_i64_generic_sys( ; SM70: { @@ -15737,6 +20982,22 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_monotonic_i64_global_sys( ; SM70: { @@ -15781,7 +21042,23 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic ret i64 %new } @@ -15833,6 +21110,22 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[release_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_acquire_i64_generic_sys( ; SM70: { @@ -15881,6 +21174,22 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_acquire_i64_global_sys( ; SM70: { @@ -15929,6 +21238,22 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[release_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_acquire_i64_shared_sys( ; SM70: { @@ -15977,6 +21302,23 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_seq_cst_i64_generic_sys( ; SM70: { @@ -16028,6 +21370,23 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: 
st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_seq_cst_i64_global_sys( ; SM70: { @@ -16079,6 +21438,23 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_seq_cst_i64_shared_sys( ; SM70: { @@ -16130,6 +21506,22 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel 
monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM70: { @@ -16178,6 +21570,22 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_monotonic_i64_global_sys( ; SM70: { @@ -16226,6 +21634,22 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { 
; SM70-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM70: { @@ -16274,6 +21698,22 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_acquire_i64_generic_sys( ; SM70: { @@ -16322,6 +21762,22 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_acquire_i64_global_sys( ; SM70: { @@ -16370,6 +21826,22 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } 
+define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_acquire_i64_shared_sys( ; SM70: { @@ -16418,6 +21890,23 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM70: { @@ -16469,6 +21958,23 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: 
.reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM70: { @@ -16520,6 +22026,23 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM70: { @@ -16571,6 +22094,23 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM70: { @@ -16622,6 +22162,23 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_monotonic_i64_global_sys( ; SM70: { @@ -16673,6 +22230,23 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: 
ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM70: { @@ -16724,6 +22298,23 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_acquire_i64_generic_sys( ; SM70: { @@ -16775,6 +22366,23 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[seq_cst_acquire_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_acquire_i64_global_sys( ; SM70: { @@ -16826,6 +22434,23 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_acquire_i64_shared_sys( ; SM70: { @@ -16877,6 +22502,23 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_generic( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM70: { @@ -16928,6 +22570,23 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_global( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM70: { @@ -16979,6 +22638,23 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_seq_cst_i64_shared( +; SM70: { +; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM70: { diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 68658255ad5af..33366ae25379b 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: 
and.b32 %r20, %r16, %r2; @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -68,7 +68,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -113,7 +113,7 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic 
monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -158,7 +158,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -178,12 +178,12 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_sys( +define i8 
@monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -203,9 +203,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -223,12 +223,12 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_cta( +define i8 
@monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -248,7 +248,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -268,12 +268,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_cluster( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; 
SM90-NEXT: .reg .b16 %rs<2>; @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -293,7 +293,7 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -313,12 +313,12 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -338,7 +338,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -358,12 +358,12 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -383,9 +383,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -403,12 +403,12 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -416,8 +416,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -448,12 +448,12 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -461,8 +461,8 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -473,7 +473,7 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -493,12 +493,12 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -506,8 +506,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: 
ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -518,7 +518,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -538,12 +538,12 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -551,8 +551,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -563,9 +563,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -581,15 +581,14 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -597,8 +596,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -609,9 +608,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -627,15 +626,14 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_cluster( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -643,8 +641,8 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -655,9 +653,9 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -673,15 +671,14 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -689,8 +686,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -701,7 +698,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -719,15 +716,15 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_sys( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -735,8 +732,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -747,9 +744,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -768,12 +765,12 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_cta( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -781,8 +778,8 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -793,9 +790,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -814,12 +811,12 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_cluster( +define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -827,8 +824,8 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -839,9 +836,9 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -860,12 +857,12 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_gpu( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -873,8 +870,8 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -885,9 +882,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -906,12 +903,12 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_sys( +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -919,8 +916,8 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM90-NEXT: and.b32 %r10, %r9, 3; @@ -931,9 +928,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -952,12 +949,12 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_cta( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -965,9 +962,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 
3; ; SM90-NEXT: shl.b32 %r1, %r10, 3; @@ -977,9 +974,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -995,15 +992,15 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_cluster( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1011,8 +1008,8 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1023,9 +1020,9 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1041,15 +1038,15 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_gpu( +define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1057,8 +1054,8 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1069,9 +1066,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1087,15 +1084,15 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1103,9 +1100,8 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: 
// %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1116,9 +1112,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1134,15 +1130,15 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( +define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1150,9 +1146,8 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr 
%addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1163,9 +1158,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1181,15 +1176,15 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 
%rs<2>; @@ -1197,9 +1192,8 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1210,9 +1204,9 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1228,15 +1222,15 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1244,9 +1238,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1257,9 +1250,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1275,15 +1268,15 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: monotonic_seq_cst_i8_global_sys( +define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1291,9 +1284,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1304,9 +1296,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1322,15 +1314,15 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) 
%addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_cta( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1338,9 +1330,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1351,9 +1342,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1369,15 +1360,15 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_cluster( +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1385,9 +1376,9 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1398,9 +1389,9 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1416,15 +1407,15 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB30_1; ; 
SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_gpu( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1432,9 +1423,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1445,9 +1436,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1463,15 +1454,15 @@ define i8 
@monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB31_1; ; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1479,9 +1470,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1492,9 +1483,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, 
%r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1510,15 +1501,15 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB32_1; ; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1526,9 +1517,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1539,9 +1530,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[monotonic_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1557,15 +1548,15 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB33_1; ; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1573,9 +1564,9 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1586,9 +1577,9 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1604,15 +1595,15 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB34_1; ; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1620,9 +1611,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, 
%r9, 3; @@ -1633,9 +1624,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1651,15 +1642,15 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB35_1; ; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_sys( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1667,8 +1658,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: 
fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1679,9 +1671,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1700,12 +1692,12 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1713,8 +1705,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1725,9 +1718,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1746,12 +1739,12 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_cluster( +define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1759,8 +1752,9 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1771,9 +1765,9 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1792,12 +1786,12 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_gpu( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1805,8 +1799,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1812,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1838,12 +1833,12 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_sys( +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1851,8 +1846,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1863,9 +1859,9 
@@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1884,12 +1880,12 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_cta( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1897,8 +1893,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1909,9 +1906,9 @@ define i8 
@acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1927,15 +1924,15 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB41_1; ; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_cluster( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1943,8 +1940,9 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: 
fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1953,9 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1973,15 +1971,15 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB42_1; ; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_gpu( +define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1989,8 +1987,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; 
SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2001,9 +2000,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2019,15 +2018,15 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB43_1; ; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2035,8 +2034,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2047,7 +2047,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2065,15 +2065,15 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB44_1; ; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_cta( +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2081,8 +2081,8 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2093,9 +2093,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2111,15 +2111,15 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB45_1; ; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_cluster( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2127,8 +2127,8 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2139,9 +2139,9 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2157,15 +2157,15 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB46_1; ; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_gpu( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ 
-2173,8 +2173,8 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2185,9 +2185,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2203,15 +2203,15 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB47_1; ; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_sys( +define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
acquire_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2219,8 +2219,8 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2231,7 +2231,7 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2249,15 +2249,15 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB48_1; ; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_cta( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg 
.b16 %rs<2>; @@ -2265,8 +2265,8 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2277,7 +2277,7 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2295,15 +2295,15 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB49_1; ; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_cluster( +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2311,8 +2311,8 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, 
i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2323,9 +2323,9 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2341,15 +2341,15 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB50_1; ; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_gpu( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2357,8 +2357,8 @@ define i8 
@acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2369,9 +2369,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2387,15 +2387,15 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB51_1; ; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_sys( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; 
SM90-NEXT: .reg .b16 %rs<2>; @@ -2403,8 +2403,8 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2415,7 +2415,7 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2433,15 +2433,15 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB52_1; ; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_cta( +define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ 
-2449,8 +2449,8 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2461,7 +2461,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2479,15 +2479,15 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB53_1; ; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_cluster( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2495,8 
+2495,8 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2507,7 +2507,7 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2525,15 +2525,15 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB54_1; ; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_gpu( +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2541,8 +2541,8 @@ define i8 
@acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2553,9 +2553,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2571,15 +2571,15 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB55_1; ; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_sys( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg 
.pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2587,8 +2587,8 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2599,7 +2599,7 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2620,12 +2620,12 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_cta( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2633,8 +2633,8 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2645,7 +2645,7 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2666,12 +2666,12 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_cluster( +define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2679,8 +2679,8 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2691,7 +2691,7 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2712,12 +2712,12 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_gpu( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2725,8 +2725,8 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2737,7 +2737,7 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2758,12 +2758,12 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_sys( +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2771,9 +2771,8 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2784,7 +2783,7 @@ define i8 
@acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2805,12 +2804,12 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_cta( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2818,9 +2817,8 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2831,7 +2829,7 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 
%r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2849,15 +2847,15 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB61_1; ; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2865,9 +2863,8 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2878,7 +2875,7 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 
%r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -2896,15 +2893,15 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB62_1; ; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_gpu( +define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2912,9 +2909,8 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2925,7 +2921,7 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: 
and.b32 %r20, %r16, %r2; @@ -2943,15 +2939,15 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB63_1; ; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_sys( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2959,9 +2955,8 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2972,9 +2967,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, 
%r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2990,15 +2985,15 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB64_1; ; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_cta( +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3006,9 +3001,8 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3019,7 +3013,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, 
[%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -3037,15 +3031,15 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_cluster( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3053,9 +3047,8 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3066,7 +3059,7 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; 
SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -3084,15 +3077,15 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_gpu( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3100,9 +3093,8 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3113,7 +3105,7 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 
%r20, %r16, %r2; @@ -3131,15 +3123,15 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB67_1; ; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_sys( +define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3147,9 +3139,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3160,9 +3151,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: 
ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3178,15 +3169,15 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB68_1; ; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_cta( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3194,9 +3185,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3207,9 +3197,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[acquire_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3225,15 +3215,15 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB69_1; ; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3241,9 +3231,8 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3254,7 +3243,7 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -3272,15 +3261,15 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB70_1; ; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3288,9 +3277,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3301,7 +3289,7 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -3319,15 +3307,15 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB71_1; ; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_sys( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3335,9 +3323,8 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3348,9 +3335,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[release_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3366,14 +3353,15 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB72_1; ; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_cta( +define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3381,9 +3369,8 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3394,9 +3381,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; 
; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3412,14 +3399,15 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB73_1; ; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_cluster( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3427,9 +3415,8 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3440,9 +3427,9 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, 
i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3458,14 +3445,15 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB74_1; ; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3473,9 +3461,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ 
-3486,7 +3474,7 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -3504,14 +3492,15 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB75_1; ; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_sys( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3519,9 +3508,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3532,9 +3521,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, 
i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3550,14 +3539,15 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB76_1; ; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_cta( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3565,9 +3555,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, 
%r9, 3; @@ -3578,9 +3568,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3596,14 +3586,15 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB77_1; ; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_cluster( +define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3611,9 +3602,9 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3624,9 +3615,9 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3642,14 +3633,15 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB78_1; ; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_gpu( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3657,9 +3649,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: 
fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3670,9 +3662,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3688,14 +3680,15 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB79_1; ; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_sys( +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3703,9 +3696,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[release_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3716,9 +3709,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3734,14 +3727,15 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB80_1; ; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_cta( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3749,9 +3743,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3762,9 +3756,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3780,14 +3774,15 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB81_1; ; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_cluster( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; 
SM90-NEXT: .reg .b16 %rs<2>; @@ -3795,9 +3790,9 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3808,9 +3803,9 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3826,14 +3821,15 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB82_1; ; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_gpu( +define i8 
@acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3841,9 +3837,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3854,9 +3850,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3872,14 +3868,15 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB83_1; ; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire 
seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_sys( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3887,9 +3884,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3900,9 +3897,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3918,15 +3915,15 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB84_1; ; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release 
acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_cta( +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3934,9 +3931,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3947,9 +3944,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3965,15 +3962,15 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB85_1; ; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_cluster( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3981,9 +3978,9 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3994,9 +3991,9 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4012,15 +4009,15 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB86_1; ; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4028,9 +4025,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4041,9 +4038,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4059,15 +4056,15 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: 
mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB87_1; ; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_sys( +define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4075,9 +4072,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4088,9 +4085,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4106,15 +4103,15 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB88_1; ; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_cta( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4122,9 +4119,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4135,9 +4132,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: 
ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4153,15 +4150,15 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB89_1; ; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_cluster( +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4169,9 +4166,9 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4182,15 +4179,15 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[release_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB90_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4200,15 +4197,14 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB90_1; ; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_gpu( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4216,9 +4212,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4229,15 +4225,15 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB91_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4247,15 +4243,14 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB91_1; ; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_sys( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4263,9 
+4258,9 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4276,15 +4271,15 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB92_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4294,15 +4289,14 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB92_1; ; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release 
acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_cta( +define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4310,9 +4304,9 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4323,15 +4317,15 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 
bra $L__BB93_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4341,15 +4335,14 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB93_1; ; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_cluster( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4357,9 +4350,9 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4370,15 +4363,15 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB94_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4388,15 +4381,14 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB94_1; ; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_gpu( +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4404,9 +4396,9 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 
%r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4417,15 +4409,15 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB95_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4435,15 +4427,14 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB95_1; ; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_sys( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4451,9 +4442,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4464,15 +4455,15 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB96_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4482,15 +4473,14 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB96_1; ; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, 
i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_cta( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4498,9 +4488,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4511,15 +4501,15 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB97_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4529,15 +4519,14 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra 
$L__BB97_1; ; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_cluster( +define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4545,9 +4534,9 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4558,15 +4547,15 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: 
or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB98_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4576,15 +4565,14 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB98_1; ; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_gpu( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4592,9 +4580,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4605,15 +4593,15 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB99_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4623,15 +4611,14 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB99_1; ; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_sys( +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4639,9 +4626,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, 
[release_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4652,15 +4639,15 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB100_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4670,15 +4657,14 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB100_1; ; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_cta( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_sys( ; 
SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4686,9 +4672,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4699,15 +4685,15 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB101_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4717,15 +4703,14 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB101_1; ; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; 
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_cluster( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4733,9 +4718,9 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4746,15 +4731,15 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB102_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4764,15 +4749,14 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB102_1; ; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_gpu( +define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4780,9 +4764,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4793,15 +4777,15 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB103_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4811,15 +4795,14 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB103_1; ; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_sys( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4827,9 +4810,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, 
[release_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4840,7 +4823,7 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -4848,7 +4831,7 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB104_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4858,15 +4841,14 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB104_1; ; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_cta( +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
release_acquire_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4874,9 +4856,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4887,15 +4869,15 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB105_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4905,15 +4887,15 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB105_1; ; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 
[func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_cluster( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4921,9 +4903,9 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4934,15 +4916,15 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB106_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4952,15 +4934,15 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB106_1; ; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_gpu( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4968,9 +4950,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4981,15 +4963,15 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: 
ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB107_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -4999,15 +4981,15 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB107_1; ; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_sys( +define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5015,9 +4997,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5028,7 +5010,7 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5036,7 +5018,7 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB108_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5046,15 +5028,15 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB108_1; ; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_cta( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ 
-5062,9 +5044,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5075,7 +5057,7 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5083,7 +5065,7 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB109_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5093,15 +5075,15 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB109_1; ; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") 
acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5109,9 +5091,9 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5122,15 +5104,15 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB110_3; 
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5140,15 +5122,15 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB110_1; ; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5156,9 +5138,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5169,15 +5151,15 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; 
SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB111_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5187,15 +5169,15 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB111_1; ; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_sys( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5203,9 +5185,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, 
%rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5216,7 +5198,7 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5224,7 +5206,7 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB112_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5234,15 +5216,15 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB112_1; ; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_cta( +define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5250,9 +5232,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5263,7 +5245,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5271,7 +5253,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB113_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5281,15 +5263,15 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB113_1; ; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5297,9 +5279,9 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5310,7 +5292,7 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5318,7 +5300,7 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, 
%r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB114_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5328,15 +5310,15 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB114_1; ; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( +define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5344,9 +5326,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5357,15 +5339,15 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[release_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB115_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5375,15 +5357,15 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB115_1; ; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5391,8 +5373,8 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; 
SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5404,7 +5386,7 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5425,12 +5407,12 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5438,8 +5420,8 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5451,7 +5433,7 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 
% ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5472,12 +5454,12 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster( +define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5485,8 +5467,8 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5498,7 +5480,7 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[acq_rel_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5519,12 +5501,12 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5532,8 +5514,8 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5545,7 +5527,7 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: 
ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5566,12 +5548,12 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_sys( +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5579,9 +5561,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5592,7 +5574,7 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5613,12 +5595,12 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; 
SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_cta( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5626,9 +5608,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5639,7 +5621,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5647,7 +5629,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB121_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5657,15 +5639,15 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB121_1; ; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5673,9 +5655,9 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5686,7 +5668,7 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, 
%r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5694,7 +5676,7 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB122_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5704,15 +5686,15 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB122_1; ; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( +define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5720,9 +5702,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5733,7 +5715,7 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5741,7 +5723,7 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB123_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5751,15 +5733,15 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB123_1; ; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_sys( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5767,9 +5749,9 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg 
.b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5780,15 +5762,15 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB124_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5798,15 +5780,15 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB124_1; ; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release 
seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_cta( +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5814,9 +5796,9 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5827,7 +5809,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5835,7 +5817,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB125_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5845,15 +5827,15 @@ define i8 
@acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB125_1; ; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_cluster( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5861,9 +5843,9 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5874,7 +5856,7 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5882,7 +5864,7 
@@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB126_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5892,15 +5874,15 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB126_1; ; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_gpu( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5908,9 +5890,9 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: 
and.b32 %r10, %r9, 3; @@ -5921,7 +5903,7 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -5929,7 +5911,7 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB127_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5939,15 +5921,15 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB127_1; ; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_sys( +define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5955,9 +5937,9 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, 
i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5968,15 +5950,15 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB128_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -5986,15 +5968,15 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB128_1; ; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg 
ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_cta( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6002,9 +5984,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6015,15 +5997,15 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB129_3; ; SM90-NEXT: // 
%bb.2: // %partword.cmpxchg.failure @@ -6033,15 +6015,15 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB129_1; ; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6049,9 +6031,9 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6062,7 +6044,7 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 
%r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -6070,7 +6052,7 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB130_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6080,15 +6062,15 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB130_1; ; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6096,9 +6078,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, 
-4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6109,7 +6091,7 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -6117,7 +6099,7 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB131_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6127,15 +6109,15 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB131_1; ; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6143,9 +6125,9 @@ define i8 
@acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6156,15 +6138,15 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB132_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6174,15 +6156,15 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB132_1; ; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr 
addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( +define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6190,9 +6172,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6203,15 +6185,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB133_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure @@ -6221,15 +6203,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB133_1; ; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6237,9 +6219,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6250,15 +6232,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: 
ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB134_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6268,15 +6250,15 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB134_1; ; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6284,9 +6266,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; 
@@ -6297,7 +6279,7 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -6305,7 +6287,7 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB135_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6315,15 +6297,15 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB135_1; ; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6331,9 +6313,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[acq_rel_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6344,15 +6326,15 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB136_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6365,12 +6347,12 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; 
SM90-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6378,9 +6360,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6391,15 +6373,15 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB137_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6412,12 +6394,12 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") 
acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( +define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6425,9 +6407,9 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6438,15 +6420,15 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, 
%r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB138_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6459,12 +6441,12 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6472,9 +6454,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6485,15 +6467,15 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB139_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6506,12 +6488,12 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6519,9 +6501,9 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6532,15 +6514,15 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB140_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6553,12 +6535,12 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6566,9 +6548,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; 
+; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6579,15 +6561,15 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB141_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6597,15 +6579,15 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB141_1; ; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; 
SM90-NEXT: .reg .b16 %rs<2>; @@ -6613,9 +6595,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6626,15 +6608,15 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB142_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6644,15 +6626,15 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB142_1; ; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( +define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6660,9 +6642,9 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6673,15 +6655,15 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, 
%r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB143_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6691,15 +6673,15 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB143_1; ; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6707,9 +6689,9 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6720,15 +6702,15 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[seq_cst_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB144_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6738,15 +6720,15 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB144_1; ; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6754,9 +6736,9 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6767,15 +6749,15 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB145_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6785,15 +6767,15 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB145_1; ; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; 
SM90-NEXT: .reg .b16 %rs<2>; @@ -6801,9 +6783,9 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6814,15 +6796,15 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB146_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6832,15 +6814,15 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB146_1; ; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6848,9 +6830,9 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6861,15 +6843,15 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB147_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6879,15 +6861,15 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB147_1; ; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_sys( +define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6895,9 +6877,9 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6908,15 +6890,15 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 
%r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB148_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6926,15 +6908,15 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB148_1; ; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_cta( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6942,9 +6924,9 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; 
+; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6955,15 +6937,15 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB149_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -6973,15 +6955,15 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB149_1; ; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
acq_rel_acquire_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6989,9 +6971,9 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7002,15 +6984,15 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB150_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7020,15 +7002,15 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB150_1; ; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; 
SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7036,9 +7018,9 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7049,15 +7031,15 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB151_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7067,15 +7049,15 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB151_1; ; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7083,9 +7065,9 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7096,15 +7078,15 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; 
SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB152_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7114,15 +7096,15 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB152_1; ; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( +define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7130,9 +7112,9 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7143,15 +7125,15 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB153_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7161,15 +7143,15 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB153_1; ; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( ; 
SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7177,9 +7159,9 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7190,15 +7172,15 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB154_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7208,15 +7190,15 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB154_1; ; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7224,9 +7206,9 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7237,15 +7219,15 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB155_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7255,15 +7237,15 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB155_1; ; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_sys( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7271,9 +7253,9 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7284,15 +7266,15 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[acq_rel_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB156_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7305,12 +7287,12 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_cta( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7318,9 +7300,9 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7331,15 +7313,15 @@ define i8 
@seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB157_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7352,12 +7334,12 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( +define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7365,9 +7347,9 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; 
+; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7378,15 +7360,15 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB158_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7399,12 +7381,12 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ 
-7412,9 +7394,9 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7425,15 +7407,15 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB159_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7446,12 +7428,12 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 
@seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_sys( +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7459,9 +7441,9 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7472,15 +7454,15 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB160_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7493,12 +7475,12 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 
%cmp, i8 %ne ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_cta( +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7506,9 +7488,9 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7519,15 +7501,15 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB161_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7537,15 +7519,15 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB161_1; ; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_cluster( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7553,9 +7535,9 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7566,15 +7548,15 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB162_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7584,15 +7566,15 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB162_1; ; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_gpu( +define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7600,9 +7582,9 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7613,15 +7595,15 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB163_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7631,15 +7613,15 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB163_1; ; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) 
{ -; SM90-LABEL: seq_cst_acquire_i8_shared_sys( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7647,9 +7629,9 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7660,7 +7642,7 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -7668,7 +7650,7 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB164_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7678,15 +7660,15 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; 
SM90-NEXT: @%p2 bra $L__BB164_1; ; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7694,9 +7676,9 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7707,15 +7689,15 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, 
%r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB165_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7725,15 +7707,15 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB165_1; ; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7741,9 +7723,9 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7754,15 +7736,15 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; 
SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB166_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7772,15 +7754,15 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB166_1; ; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7788,9 +7770,9 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: 
fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7801,15 +7783,15 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB167_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7819,15 +7801,15 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB167_1; ; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys( +define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 
%new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7835,9 +7817,9 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7848,7 +7830,7 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -7856,7 +7838,7 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB168_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7866,15 +7848,15 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB168_1; ; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; 
; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7882,9 +7864,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7895,7 +7877,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -7903,7 +7885,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB169_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7913,15 +7895,15 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB169_1; ; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7929,9 +7911,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7942,15 +7924,15 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB170_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -7960,15 +7942,15 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB170_1; ; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7976,9 +7958,9 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7989,15 +7971,15 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB171_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -8007,15 +7989,15 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB171_1; ; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8023,9 +8005,9 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg 
.b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8036,7 +8018,7 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -8044,7 +8026,7 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB172_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -8054,15 +8036,15 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB172_1; ; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( +define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8070,9 +8052,9 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8083,7 +8065,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -8091,7 +8073,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB173_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure @@ -8101,15 +8083,15 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB173_1; ; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8117,9 +8099,9 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8130,7 +8112,7 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: 
ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -8138,7 +8120,7 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB174_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -8148,15 +8130,15 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB174_1; ; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8164,9 +8146,9 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8177,15 +8159,15 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB175_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -8195,15 +8177,15 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB175_1; ; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8211,8 +8193,8 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr 
addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8224,7 +8206,7 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -8245,12 +8227,12 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8258,8 +8240,8 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8271,7 +8253,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -8292,12 +8274,12 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( +define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8305,8 +8287,8 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8318,7 +8300,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; @@ -8339,12 +8321,12 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8352,8 +8334,2123 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; 
SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB179_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB179_1; +; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: 
not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB180_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB180_1; +; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: 
ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB181_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB181_1; +; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; 
+; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB182_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB182_1; +; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM90-NEXT: 
// =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB183_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB183_1; +; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; 
SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB184_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB184_1; +; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra 
$L__BB185_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB185_1; +; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB186_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 
+; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB186_1; +; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB187_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 
%r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB187_1; +; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB188_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB188_1; +; SM90-NEXT: 
$L__BB188_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB189_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB189_1; +; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB190_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB190_1; +; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB191_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB191_1; +; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB192_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB192_1; +; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( 
+; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB193_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB193_1; +; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 
%rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB194_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB194_1; +; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB195_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB195_1; +; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; 
+; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB196_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB196_1; +; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; 
+; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB197_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB197_1; +; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB198_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB198_1; +; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 
%r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB199_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB199_1; +; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 
%r20, %r16, %r2; +; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB200_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB200_1; +; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; 
SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB201_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB201_1; +; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB202_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB202_1; +; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB203_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB203_1; +; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB204_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB204_1; +; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB205_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB205_1; +; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB206_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB206_1; +; SM90-NEXT: $L__BB206_3: // 
%partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB207_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB207_1; +; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; 
SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB208_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB208_1; +; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new syncscope("cluster") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB209_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB209_1; +; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr 
%addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB210_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB210_1; +; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 
%r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB211_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB211_1; +; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, 
[seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB212_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB212_1; +; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; +; 
SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB213_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB213_1; +; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 
%r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB214_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB214_1; +; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, 
%r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB215_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB215_1; +; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; 
SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB216_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB216_1; +; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB217_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB217_1; +; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; 
SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB218_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB218_1; +; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB219_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB219_1; +; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: 
atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB220_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB220_1; +; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra 
$L__BB221_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB221_1; +; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB222_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB222_1 
Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB222_1; +; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r9, %rd2; +; SM90-NEXT: and.b32 %r10, %r9, 3; +; SM90-NEXT: shl.b32 %r1, %r10, 3; +; SM90-NEXT: mov.b32 %r11, 255; +; SM90-NEXT: shl.b32 %r12, %r11, %r1; +; SM90-NEXT: not.b32 %r2, %r12; +; SM90-NEXT: cvt.u32.u16 %r13, %rs1; +; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB223_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, 
%r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB223_1; +; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8369,20 +10466,20 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r4, %r15, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB179_3; +; SM90-NEXT: @%p1 bra $L__BB224_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB179_1; -; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB224_1; +; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; @@ -8390,6 +10487,50 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ret i8 %new } +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB225_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB225_1; +; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + define i16 
@monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_monotonic_i16_generic_sys( ; SM90: { @@ -8414,20 +10555,20 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB180_3; +; SM90-NEXT: @%p1 bra $L__BB226_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB180_1; -; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB226_1; +; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -8458,20 +10599,20 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB181_3; +; SM90-NEXT: @%p1 bra $L__BB227_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB181_1; -; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB227_1; +; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -8502,28 +10643,72 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB182_3; +; SM90-NEXT: @%p1 bra $L__BB228_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB228_1; +; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; 
SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB229_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB182_1; -; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB229_1; +; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred 
%p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8531,10 +10716,10 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8544,25 +10729,25 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB183_3; +; SM90-NEXT: @%p1 bra $L__BB230_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB183_1; -; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra 
$L__BB230_1; +; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic ret i16 %new } @@ -8590,20 +10775,20 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB184_3; +; SM90-NEXT: @%p1 bra $L__BB231_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB184_1; -; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB231_1; +; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -8634,20 +10819,20 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, 
%r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB185_3; +; SM90-NEXT: @%p1 bra $L__BB232_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB185_1; -; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB232_1; +; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -8678,20 +10863,20 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB186_3; +; SM90-NEXT: @%p1 bra $L__BB233_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB186_1; -; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB233_1; +; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("cluster") monotonic monotonic @@ -8722,26 +10907,70 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB187_3; +; SM90-NEXT: @%p1 bra $L__BB234_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB187_1; -; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB234_1; +; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: 
mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB235_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB235_1; +; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_monotonic_i16_shared_sys( ; SM90: { @@ -8766,20 +10995,20 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB188_3; +; SM90-NEXT: @%p1 bra $L__BB236_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM90-NEXT: // in 
Loop: Header=BB236_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB188_1; -; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB236_1; +; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -8810,20 +11039,20 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB189_3; +; SM90-NEXT: @%p1 bra $L__BB237_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB189_1; -; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB237_1; +; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -8854,20 +11083,20 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM90-NEXT: 
$L__BB238_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB190_3; +; SM90-NEXT: @%p1 bra $L__BB238_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB190_1; -; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB238_1; +; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic @@ -8898,26 +11127,71 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB191_3; +; SM90-NEXT: @%p1 bra $L__BB239_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB191_1; -; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB239_1; +; SM90-NEXT: $L__BB239_3: 
// %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB240_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB240_1; +; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + define i16 
@monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_acquire_i16_generic_sys( ; SM90: { @@ -8942,20 +11216,20 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB192_3; +; SM90-NEXT: @%p1 bra $L__BB241_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB192_1; -; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB241_1; +; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -8987,20 +11261,20 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB193_3; +; SM90-NEXT: @%p1 bra $L__BB242_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 +; 
SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB193_1; -; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB242_1; +; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9032,20 +11306,20 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB194_3; +; SM90-NEXT: @%p1 bra $L__BB243_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB194_1; -; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB243_1; +; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9062,10 +11336,55 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB244_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB244_1; +; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; 
+; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9075,26 +11394,26 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB195_3; +; SM90-NEXT: @%p1 bra $L__BB245_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB195_1; -; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB245_1; +; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire ret i16 %new } @@ -9122,20 +11441,20 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB196_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB196_3; +; SM90-NEXT: @%p1 bra $L__BB246_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB196_1; -; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB246_1; +; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9167,20 +11486,20 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB197_3; +; SM90-NEXT: @%p1 bra $L__BB247_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB197_1; -; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB247_1; +; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9212,20 +11531,20 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB198_3; +; SM90-NEXT: @%p1 bra $L__BB248_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB198_1; -; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB248_1; +; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9257,20 +11576,20 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB199_3; +; SM90-NEXT: @%p1 bra $L__BB249_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 +; 
SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB199_1; -; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB249_1; +; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9278,6 +11597,51 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB250_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; 
SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB250_1; +; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_acquire_i16_shared_sys( ; SM90: { @@ -9302,20 +11666,20 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB200_3; +; SM90-NEXT: @%p1 bra $L__BB251_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB200_1; -; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB251_1; +; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9347,20 +11711,20 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB252_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB201_3; +; SM90-NEXT: @%p1 bra $L__BB252_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB201_1; -; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB252_1; +; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9392,20 +11756,20 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB202_3; +; SM90-NEXT: @%p1 bra $L__BB253_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB202_1; -; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB253_1; +; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 
[func_retval0], %r14; ; SM90-NEXT: ret; @@ -9437,20 +11801,20 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB203_3; +; SM90-NEXT: @%p1 bra $L__BB254_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB203_1; -; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB254_1; +; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9458,6 +11822,52 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; 
SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB255_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB255_1; +; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_seq_cst_i16_generic_sys( ; SM90: { @@ -9483,20 +11893,20 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB204_3; +; SM90-NEXT: @%p1 bra $L__BB256_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 
+; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB204_1; -; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB256_1; +; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9529,20 +11939,20 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB205_3; +; SM90-NEXT: @%p1 bra $L__BB257_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB205_1; -; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB257_1; +; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9575,20 +11985,20 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; 
SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB206_3; +; SM90-NEXT: @%p1 bra $L__BB258_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB206_1; -; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB258_1; +; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9596,8 +12006,54 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ret i16 %new } -define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic_gpu( +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 
%r19, %r15, %r2; +; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB259_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB259_1; +; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -9605,10 +12061,10 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9619,26 +12075,26 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB207_3; +; SM90-NEXT: @%p1 bra $L__BB260_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB207_1; -; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB260_1; +; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst ret i16 %new } @@ -9667,20 +12123,20 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra 
$L__BB208_3; +; SM90-NEXT: @%p1 bra $L__BB261_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB208_1; -; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB261_1; +; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9713,20 +12169,20 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB209_3; +; SM90-NEXT: @%p1 bra $L__BB262_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB209_1; -; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB262_1; +; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9759,20 +12215,20 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB210_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB210_3; +; SM90-NEXT: @%p1 bra $L__BB263_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB210_1; -; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB263_1; +; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9805,20 +12261,20 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB211_3; +; SM90-NEXT: @%p1 bra $L__BB264_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB211_1; -; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB264_1; +; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9826,6 +12282,52 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB265_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB265_1; +; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, 
i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_seq_cst_i16_shared_sys( ; SM90: { @@ -9851,20 +12353,20 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB212_3; +; SM90-NEXT: @%p1 bra $L__BB266_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB212_1; -; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB266_1; +; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9897,20 +12399,20 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB213_3; +; SM90-NEXT: @%p1 bra 
$L__BB267_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB213_1; -; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB267_1; +; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9943,20 +12445,20 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB214_3; +; SM90-NEXT: @%p1 bra $L__BB268_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB214_1; -; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB268_1; +; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -9989,20 +12491,20 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB215_3; +; SM90-NEXT: @%p1 bra $L__BB269_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB215_1; -; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB269_1; +; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10010,6 +12512,51 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: 
$L__BB270_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB270_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB270_1; +; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_monotonic_i16_generic_sys( ; SM90: { @@ -10034,20 +12581,20 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB216_3; +; SM90-NEXT: @%p1 bra $L__BB271_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB216_1; -; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB271_1; +; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10079,20 +12626,20 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB217_3; +; SM90-NEXT: @%p1 bra $L__BB272_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB217_1; -; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB272_1; +; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10124,29 +12671,74 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB218_3; +; SM90-NEXT: @%p1 bra $L__BB273_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM90-NEXT: // in Loop: 
Header=BB273_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB218_1; -; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB273_1; +; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB274_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: 
Header=BB274_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB274_1; +; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic_gpu( +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -10154,10 +12746,10 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10167,26 +12759,26 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB219_3; +; SM90-NEXT: @%p1 bra $L__BB275_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB219_1; -; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB275_1; +; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic ret i16 %new } @@ -10214,20 +12806,20 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB220_3; +; SM90-NEXT: @%p1 bra $L__BB276_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; 
SM90-NEXT: @%p2 bra $L__BB220_1; -; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB276_1; +; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10259,20 +12851,20 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB221_3; +; SM90-NEXT: @%p1 bra $L__BB277_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB221_1; -; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB277_1; +; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10304,20 +12896,20 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, 
%r17; -; SM90-NEXT: @%p1 bra $L__BB222_3; +; SM90-NEXT: @%p1 bra $L__BB278_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB222_1; -; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB278_1; +; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10349,20 +12941,20 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB223_3; +; SM90-NEXT: @%p1 bra $L__BB279_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB223_1; -; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB279_1; +; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10370,6 +12962,51 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; 
SM90-LABEL: acquire_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB280_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB280_1; +; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_monotonic_i16_shared_sys( ; SM90: { @@ -10394,20 +13031,20 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB224_3; +; SM90-NEXT: @%p1 bra $L__BB281_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB224_1; -; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB281_1; +; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10439,20 +13076,20 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB225_3; +; SM90-NEXT: @%p1 bra $L__BB282_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB225_1; -; SM90-NEXT: $L__BB225_3: // 
%partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB282_1; +; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10484,20 +13121,20 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB226_3; +; SM90-NEXT: @%p1 bra $L__BB283_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB226_1; -; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB283_1; +; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10529,20 +13166,20 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB227_3; +; SM90-NEXT: 
@%p1 bra $L__BB284_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB227_1; -; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB284_1; +; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10550,6 +13187,51 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB285_3; +; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB285_1; +; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_acquire_i16_generic_sys( ; SM90: { @@ -10574,20 +13256,20 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB228_3; +; SM90-NEXT: @%p1 bra $L__BB286_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB228_1; -; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB286_1; +; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10619,20 +13301,20 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB229_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB229_3; +; SM90-NEXT: @%p1 bra $L__BB287_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB229_1; -; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB287_1; +; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10664,20 +13346,20 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB230_3; +; SM90-NEXT: @%p1 bra $L__BB288_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB230_1; -; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB288_1; +; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10694,10 +13376,55 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB289_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB289_1; +; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + ret i16 %new +} + +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { 
+; SM90-LABEL: acquire_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10707,26 +13434,26 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB231_3; +; SM90-NEXT: @%p1 bra $L__BB290_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB231_1; -; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB290_1; +; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 
[func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire ret i16 %new } @@ -10754,20 +13481,20 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB232_3; +; SM90-NEXT: @%p1 bra $L__BB291_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB232_1; -; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB291_1; +; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10799,20 +13526,20 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB233_3; +; 
SM90-NEXT: @%p1 bra $L__BB292_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB233_1; -; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB292_1; +; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10844,20 +13571,20 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB234_3; +; SM90-NEXT: @%p1 bra $L__BB293_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB234_1; -; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB293_1; +; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10889,20 +13616,20 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB235_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB235_3; +; SM90-NEXT: @%p1 bra $L__BB294_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB235_1; -; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB294_1; +; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10910,6 +13637,51 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, 
%r15, %r2; +; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB295_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB295_1; +; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + ret i16 %new +} + define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_acquire_i16_shared_sys( ; SM90: { @@ -10934,20 +13706,20 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB236_3; +; SM90-NEXT: @%p1 bra $L__BB296_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB236_1; -; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra 
$L__BB296_1; +; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -10979,20 +13751,20 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB237_3; +; SM90-NEXT: @%p1 bra $L__BB297_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB237_1; -; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB297_1; +; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11024,20 +13796,20 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB238_3; +; SM90-NEXT: @%p1 bra $L__BB298_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB238_1; -; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB298_1; +; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11069,20 +13841,20 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB239_3; +; SM90-NEXT: @%p1 bra $L__BB299_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB239_1; -; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB299_1; +; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11090,6 +13862,52 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; 
SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB300_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB300_1; +; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_seq_cst_i16_generic_sys( ; SM90: { @@ -11115,20 +13933,20 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; 
SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB240_3; +; SM90-NEXT: @%p1 bra $L__BB301_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB240_1; -; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB301_1; +; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11161,20 +13979,20 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB241_3; +; SM90-NEXT: @%p1 bra $L__BB302_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB241_1; -; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB302_1; +; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 
[func_retval0], %r14; ; SM90-NEXT: ret; @@ -11207,20 +14025,20 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB242_3; +; SM90-NEXT: @%p1 bra $L__BB303_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB242_1; -; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB303_1; +; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11228,8 +14046,54 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic_gpu( +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, 
[acquire_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB304_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB304_1; +; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + ret i16 %new +} + +define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11237,10 +14101,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, 
[acquire_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11251,26 +14115,26 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB243_3; +; SM90-NEXT: @%p1 bra $L__BB305_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB243_1; -; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB305_1; +; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst ret i16 %new } @@ -11299,20 +14163,20 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB244_3; +; SM90-NEXT: @%p1 bra $L__BB306_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB244_1; -; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB306_1; +; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11345,20 +14209,20 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB245_3; +; SM90-NEXT: @%p1 bra $L__BB307_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB245_1; -; SM90-NEXT: $L__BB245_3: // 
%partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB307_1; +; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11391,20 +14255,20 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB246_3; +; SM90-NEXT: @%p1 bra $L__BB308_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB246_1; -; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB308_1; +; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11437,20 +14301,20 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB247_3; +; SM90-NEXT: 
@%p1 bra $L__BB309_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB247_1; -; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB309_1; +; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11458,6 +14322,52 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra 
$L__BB310_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB310_1; +; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + ret i16 %new +} + define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_seq_cst_i16_shared_sys( ; SM90: { @@ -11483,20 +14393,20 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB248_3; +; SM90-NEXT: @%p1 bra $L__BB311_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB248_1; -; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB311_1; +; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11529,20 +14439,20 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 
%r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB249_3; +; SM90-NEXT: @%p1 bra $L__BB312_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB249_1; -; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB312_1; +; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11575,20 +14485,20 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB250_3; +; SM90-NEXT: @%p1 bra $L__BB313_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB250_1; -; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM90-NEXT: 
@%p2 bra $L__BB313_1; +; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11621,20 +14531,20 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB251_3; +; SM90-NEXT: @%p1 bra $L__BB314_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB251_1; -; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB314_1; +; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11642,6 +14552,51 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, 
[release_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB315_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB315_1; +; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_monotonic_i16_generic_sys( ; SM90: { @@ -11667,20 +14622,20 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB252_3; +; SM90-NEXT: 
@%p1 bra $L__BB316_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB252_1; -; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB316_1; +; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -11712,20 +14667,20 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB253_3; +; SM90-NEXT: @%p1 bra $L__BB317_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB253_1; -; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB317_1; +; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -11757,28 +14712,73 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: 
and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB254_3; +; SM90-NEXT: @%p1 bra $L__BB318_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB254_1; -; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB318_1; +; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + ret i16 %new +} + +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; 
SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB319_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB319_1; +; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic_gpu( +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11786,10 +14786,10 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; ; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11800,25 +14800,25 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB255_3; +; SM90-NEXT: @%p1 bra $L__BB320_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB255_1; -; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB320_1; +; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic ret i16 %new } @@ -11847,20 +14847,20 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, 
%r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB256_3; +; SM90-NEXT: @%p1 bra $L__BB321_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB256_1; -; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB321_1; +; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -11892,20 +14892,20 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB257_3; +; SM90-NEXT: @%p1 bra $L__BB322_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB257_1; -; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB322_1; +; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) 
%addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -11937,20 +14937,20 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB258_3; +; SM90-NEXT: @%p1 bra $L__BB323_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB258_1; -; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB323_1; +; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic @@ -11982,26 +14982,71 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB259_3; +; SM90-NEXT: @%p1 bra $L__BB324_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB259_1; -; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB324_1; +; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB325_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB325_1 
Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB325_1; +; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + ret i16 %new +} + define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_monotonic_i16_shared_sys( ; SM90: { @@ -12027,20 +15072,20 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB260_3; +; SM90-NEXT: @%p1 bra $L__BB326_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB260_1; -; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB326_1; +; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -12072,20 +15117,20 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: 
$L__BB261_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB261_3; +; SM90-NEXT: @%p1 bra $L__BB327_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB261_1; -; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB327_1; +; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -12117,20 +15162,20 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB262_3; +; SM90-NEXT: @%p1 bra $L__BB328_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB262_1; -; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM90-NEXT: 
@%p2 bra $L__BB328_1; +; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic @@ -12162,26 +15207,72 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB263_3; +; SM90-NEXT: @%p1 bra $L__BB329_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB263_1; -; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB329_1; +; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, 
[release_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB330_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB330_1; +; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_acquire_i16_generic_sys( ; SM90: { @@ -12207,20 +15298,20 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra 
$L__BB264_3; +; SM90-NEXT: @%p1 bra $L__BB331_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB264_1; -; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB331_1; +; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12253,20 +15344,20 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB265_3; +; SM90-NEXT: @%p1 bra $L__BB332_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB265_1; -; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB332_1; +; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12299,29 +15390,75 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop 
+; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB266_3; +; SM90-NEXT: @%p1 bra $L__BB333_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB333_1; +; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + ret i16 %new +} + +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM90-NEXT: // 
=>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB334_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB266_1; -; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: @%p2 bra $L__BB334_1; +; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic_gpu( +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -12329,10 +15466,10 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12343,26 +15480,26 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB267_3; +; SM90-NEXT: @%p1 bra $L__BB335_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB267_1; -; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB335_1; +; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire ret i16 %new } @@ -12391,20 +15528,20 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB268_3; +; SM90-NEXT: @%p1 bra $L__BB336_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB268_1; -; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB336_1; +; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12437,20 +15574,20 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB269_3; +; SM90-NEXT: @%p1 bra $L__BB337_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB269_1; -; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB337_1; +; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12483,20 +15620,20 @@ 
define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB270_3; +; SM90-NEXT: @%p1 bra $L__BB338_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB270_1; -; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB338_1; +; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12529,20 +15666,20 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB271_3; +; SM90-NEXT: @%p1 bra $L__BB339_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 
%p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB271_1; -; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB339_1; +; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12550,6 +15687,52 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB340_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; 
SM90-NEXT: @%p2 bra $L__BB340_1; +; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + ret i16 %new +} + define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_acquire_i16_shared_sys( ; SM90: { @@ -12575,20 +15758,20 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB272_3; +; SM90-NEXT: @%p1 bra $L__BB341_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB272_1; -; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB341_1; +; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12621,20 +15804,20 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 
%r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB273_3; +; SM90-NEXT: @%p1 bra $L__BB342_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB273_1; -; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB342_1; +; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12667,20 +15850,20 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB274_3; +; SM90-NEXT: @%p1 bra $L__BB343_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB274_1; -; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB343_1; +; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12713,20 +15896,20 @@ define i16 
@release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB275_3; +; SM90-NEXT: @%p1 bra $L__BB344_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB275_1; -; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB344_1; +; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12734,6 +15917,52 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 
%r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB345_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB345_1; +; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_seq_cst_i16_generic_sys( ; SM90: { @@ -12759,20 +15988,20 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB276_3; +; SM90-NEXT: @%p1 bra $L__BB346_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: 
setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB276_1; -; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB346_1; +; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12805,20 +16034,20 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB277_3; +; SM90-NEXT: @%p1 bra $L__BB347_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB277_1; -; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB347_1; +; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12835,10 +16064,56 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, 
[release_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB348_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB348_1; +; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + ret i16 %new +} + +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: 
fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12851,29 +16126,29 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB278_3; +; SM90-NEXT: @%p1 bra $L__BB349_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB278_1; -; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: @%p2 bra $L__BB349_1; +; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic_gpu( +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -12881,10 +16156,10 
@@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12895,26 +16170,26 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB279_3; +; SM90-NEXT: @%p1 bra $L__BB350_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB279_1; -; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB350_1; +; SM90-NEXT: 
$L__BB350_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst ret i16 %new } @@ -12943,20 +16218,20 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB280_3; +; SM90-NEXT: @%p1 bra $L__BB351_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB280_1; -; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB351_1; +; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12989,20 +16264,20 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], 
%r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB281_3; +; SM90-NEXT: @%p1 bra $L__BB352_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB281_1; -; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB352_1; +; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13035,20 +16310,20 @@ define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB282_3; +; SM90-NEXT: @%p1 bra $L__BB353_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB282_1; -; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB353_1; +; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13081,20 +16356,20 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB283_3; +; SM90-NEXT: @%p1 bra $L__BB354_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB283_1; -; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB354_1; +; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13102,6 +16377,52 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 
%r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB355_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB355_1; +; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + ret i16 %new +} + define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_seq_cst_i16_shared_sys( ; SM90: { @@ -13127,20 +16448,20 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB284_3; +; SM90-NEXT: @%p1 bra $L__BB356_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, 
%r8; -; SM90-NEXT: @%p2 bra $L__BB284_1; -; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB356_1; +; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13173,20 +16494,20 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB285_3; +; SM90-NEXT: @%p1 bra $L__BB357_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB285_1; -; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB357_1; +; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13219,20 +16540,20 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 
%p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB286_3; +; SM90-NEXT: @%p1 bra $L__BB358_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB286_1; -; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB358_1; +; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13265,20 +16586,20 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB287_3; +; SM90-NEXT: @%p1 bra $L__BB359_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB287_1; -; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB359_1; +; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13286,6 +16607,52 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; 
SM90-LABEL: acq_rel_monotonic_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB360_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB360_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB360_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB360_1; +; SM90-NEXT: $L__BB360_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_monotonic_i16_generic_sys( ; SM90: { @@ -13311,20 +16678,20 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB361_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB288_3; +; SM90-NEXT: @%p1 bra $L__BB361_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB361_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB288_1; -; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB361_1; +; SM90-NEXT: $L__BB361_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13357,20 +16724,20 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB362_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB289_3; +; SM90-NEXT: @%p1 bra $L__BB362_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB362_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB289_1; -; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; 
SM90-NEXT: @%p2 bra $L__BB362_1; +; SM90-NEXT: $L__BB362_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13378,8 +16745,54 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( +define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB363_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB363_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB363_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB363_1; +; SM90-NEXT: $L__BB363_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + ret i16 %new +} + +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -13387,10 +16800,10 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13403,29 +16816,29 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB364_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: 
@%p1 bra $L__BB290_3; +; SM90-NEXT: @%p1 bra $L__BB364_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB364_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB290_1; -; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: @%p2 bra $L__BB364_1; +; SM90-NEXT: $L__BB364_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -13433,10 +16846,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13447,26 +16860,26 @@ define i16 
@acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB365_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB291_3; +; SM90-NEXT: @%p1 bra $L__BB365_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB365_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB291_1; -; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB365_1; +; SM90-NEXT: $L__BB365_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic ret i16 %new } @@ -13495,20 +16908,20 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB366_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB292_3; +; SM90-NEXT: @%p1 bra $L__BB366_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB366_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB292_1; -; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB366_1; +; SM90-NEXT: $L__BB366_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13541,20 +16954,20 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB367_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB293_3; +; SM90-NEXT: @%p1 bra $L__BB367_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB367_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB293_1; -; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB367_1; +; SM90-NEXT: $L__BB367_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13587,20 +17000,20 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB368_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB294_3; +; SM90-NEXT: @%p1 bra $L__BB368_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB368_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB294_1; -; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB368_1; +; SM90-NEXT: $L__BB368_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13633,20 +17046,20 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB369_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB295_3; +; SM90-NEXT: @%p1 bra $L__BB369_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB369_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB295_1; -; SM90-NEXT: 
$L__BB295_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB369_1; +; SM90-NEXT: $L__BB369_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13654,6 +17067,52 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB370_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB370_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB370_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB370_1; +; SM90-NEXT: $L__BB370_3: // %partword.cmpxchg.end +; 
SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + ret i16 %new +} + define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_monotonic_i16_shared_sys( ; SM90: { @@ -13679,20 +17138,20 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB371_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB296_3; +; SM90-NEXT: @%p1 bra $L__BB371_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB371_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB296_1; -; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB371_1; +; SM90-NEXT: $L__BB371_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13725,20 +17184,20 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB372_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB297_3; +; SM90-NEXT: @%p1 bra $L__BB372_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB372_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB297_1; -; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB372_1; +; SM90-NEXT: $L__BB372_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13771,20 +17230,20 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB373_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB298_3; +; SM90-NEXT: @%p1 bra $L__BB373_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB373_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB298_1; -; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB373_1; +; SM90-NEXT: $L__BB373_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13817,20 +17276,20 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB374_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB299_3; +; SM90-NEXT: @%p1 bra $L__BB374_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB374_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB299_1; -; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB374_1; +; SM90-NEXT: $L__BB374_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13838,6 +17297,52 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 
%r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB375_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB375_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB375_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB375_1; +; SM90-NEXT: $L__BB375_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_acquire_i16_generic_sys( ; SM90: { @@ -13863,20 +17368,20 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB376_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB300_3; +; SM90-NEXT: @%p1 bra $L__BB376_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB376_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: 
@%p2 bra $L__BB300_1; -; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB376_1; +; SM90-NEXT: $L__BB376_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13909,20 +17414,20 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB377_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB301_3; +; SM90-NEXT: @%p1 bra $L__BB377_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB377_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB301_1; -; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB377_1; +; SM90-NEXT: $L__BB377_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13955,20 +17460,20 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB378_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB302_3; 
+; SM90-NEXT: @%p1 bra $L__BB378_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB378_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB302_1; -; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB378_1; +; SM90-NEXT: $L__BB378_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14001,20 +17506,20 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB379_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB303_3; +; SM90-NEXT: @%p1 bra $L__BB379_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB379_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB303_1; -; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB379_1; +; SM90-NEXT: $L__BB379_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14022,6 +17527,52 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global( +; SM90: { +; SM90-NEXT: .reg 
.pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB380_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB380_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB380_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB380_1; +; SM90-NEXT: $L__BB380_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + ret i16 %new +} + define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_acquire_i16_global_sys( ; SM90: { @@ -14047,20 +17598,20 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, 
[%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB381_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB304_3; +; SM90-NEXT: @%p1 bra $L__BB381_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB381_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB304_1; -; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB381_1; +; SM90-NEXT: $L__BB381_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14093,20 +17644,20 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB382_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB305_3; +; SM90-NEXT: @%p1 bra $L__BB382_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB382_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB305_1; -; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra 
$L__BB382_1; +; SM90-NEXT: $L__BB382_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14123,10 +17674,56 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB383_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB383_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB383_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB383_1; +; SM90-NEXT: $L__BB383_3: // %partword.cmpxchg.end +; SM90-NEXT: 
fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + ret i16 %new +} + +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14139,29 +17736,29 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB384_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB306_3; +; SM90-NEXT: @%p1 bra $L__BB384_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB384_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB306_1; -; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; 
SM90-NEXT: @%p2 bra $L__BB384_1; +; SM90-NEXT: $L__BB384_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global_gpu( +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_shared( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -14169,10 +17766,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14183,26 +17780,26 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB385_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB307_3; +; SM90-NEXT: @%p1 bra $L__BB385_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB385_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB307_1; -; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB385_1; +; SM90-NEXT: $L__BB385_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire ret i16 %new } @@ -14231,20 +17828,20 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB386_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB308_3; +; SM90-NEXT: @%p1 bra $L__BB386_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB386_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 
%r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB308_1; -; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB386_1; +; SM90-NEXT: $L__BB386_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14277,20 +17874,20 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB387_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB309_3; +; SM90-NEXT: @%p1 bra $L__BB387_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB387_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB309_1; -; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB387_1; +; SM90-NEXT: $L__BB387_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14323,20 +17920,20 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB388_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB310_3; +; SM90-NEXT: @%p1 bra $L__BB388_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB388_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB310_1; -; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB388_1; +; SM90-NEXT: $L__BB388_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14369,20 +17966,20 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB389_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB311_3; +; SM90-NEXT: @%p1 bra $L__BB389_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB389_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB311_1; -; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB389_1; +; SM90-NEXT: $L__BB389_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14390,6 +17987,52 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { 
+; SM90-LABEL: acq_rel_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB390_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB390_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB390_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB390_1; +; SM90-NEXT: $L__BB390_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys( ; SM90: { @@ -14415,20 +18058,20 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB391_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB312_3; +; SM90-NEXT: @%p1 bra $L__BB391_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB391_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB312_1; -; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB391_1; +; SM90-NEXT: $L__BB391_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14461,20 +18104,20 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB392_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB313_3; +; SM90-NEXT: @%p1 bra $L__BB392_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB392_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB313_1; -; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra 
$L__BB392_1; +; SM90-NEXT: $L__BB392_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14507,20 +18150,20 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB393_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB314_3; +; SM90-NEXT: @%p1 bra $L__BB393_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB393_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB314_1; -; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB393_1; +; SM90-NEXT: $L__BB393_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14553,20 +18196,20 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB394_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB315_3; +; SM90-NEXT: @%p1 bra $L__BB394_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB394_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB315_1; -; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB394_1; +; SM90-NEXT: $L__BB394_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14574,6 +18217,52 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB395_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB395_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: 
Header=BB395_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB395_1; +; SM90-NEXT: $L__BB395_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_seq_cst_i16_global_sys( ; SM90: { @@ -14599,20 +18288,20 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB396_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB316_3; +; SM90-NEXT: @%p1 bra $L__BB396_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB396_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB316_1; -; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB396_1; +; SM90-NEXT: $L__BB396_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14645,20 +18334,20 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB317_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: $L__BB397_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB317_3; +; SM90-NEXT: @%p1 bra $L__BB397_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB397_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB317_1; -; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB397_1; +; SM90-NEXT: $L__BB397_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14691,20 +18380,20 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB398_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB318_3; +; SM90-NEXT: @%p1 bra $L__BB398_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB398_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB318_1; -; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB398_1; +; SM90-NEXT: $L__BB398_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14737,20 +18426,20 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB399_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB319_3; +; SM90-NEXT: @%p1 bra $L__BB399_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB399_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB319_1; -; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB399_1; +; SM90-NEXT: $L__BB399_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14758,6 +18447,52 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 
%r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB400_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB400_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB400_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB400_1; +; SM90-NEXT: $L__BB400_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + ret i16 %new +} + define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys( ; SM90: { @@ -14783,20 +18518,20 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB401_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB320_3; +; SM90-NEXT: @%p1 bra $L__BB401_3; ; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB401_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB320_1; -; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB401_1; +; SM90-NEXT: $L__BB401_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14829,20 +18564,20 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB402_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB321_3; +; SM90-NEXT: @%p1 bra $L__BB402_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB402_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB321_1; -; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB402_1; +; SM90-NEXT: $L__BB402_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14850,8 +18585,54 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster( +define i16 
@acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB403_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB403_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB403_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB403_1; +; SM90-NEXT: $L__BB403_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + ret i16 %new +} + +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: 
acq_rel_seq_cst_i16_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -14859,10 +18640,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14875,29 +18656,29 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB404_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB322_3; +; SM90-NEXT: @%p1 bra $L__BB404_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB404_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB322_1; -; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.cluster; +; SM90-NEXT: @%p2 bra $L__BB404_1; +; SM90-NEXT: $L__BB404_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu( +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_generic( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -14905,10 +18686,10 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14919,26 +18700,26 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB405_1: // %partword.cmpxchg.loop ; SM90-NEXT: // 
=>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB323_3; +; SM90-NEXT: @%p1 bra $L__BB405_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB405_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB323_1; -; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB405_1; +; SM90-NEXT: $L__BB405_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic ret i16 %new } @@ -14967,20 +18748,20 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB406_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB324_3; +; SM90-NEXT: @%p1 bra $L__BB406_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB406_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: 
@%p2 bra $L__BB324_1; -; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB406_1; +; SM90-NEXT: $L__BB406_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15013,20 +18794,20 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB407_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB325_3; +; SM90-NEXT: @%p1 bra $L__BB407_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB407_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB325_1; -; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB407_1; +; SM90-NEXT: $L__BB407_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15059,20 +18840,20 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB408_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra 
$L__BB326_3; +; SM90-NEXT: @%p1 bra $L__BB408_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB408_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB326_1; -; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB408_1; +; SM90-NEXT: $L__BB408_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15105,20 +18886,20 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB409_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB327_3; +; SM90-NEXT: @%p1 bra $L__BB409_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB409_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB327_1; -; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB409_1; +; SM90-NEXT: $L__BB409_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15126,6 +18907,52 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global( +; SM90: 
{ +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB410_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB410_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB410_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB410_1; +; SM90-NEXT: $L__BB410_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_monotonic_i16_global_sys( ; SM90: { @@ -15151,20 +18978,20 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB411_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB328_3; +; SM90-NEXT: @%p1 bra $L__BB411_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB411_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB328_1; -; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB411_1; +; SM90-NEXT: $L__BB411_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15197,20 +19024,20 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB412_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB329_3; +; SM90-NEXT: @%p1 bra $L__BB412_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB412_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB329_1; -; SM90-NEXT: $L__BB329_3: // 
%partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB412_1; +; SM90-NEXT: $L__BB412_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15243,20 +19070,20 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB413_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB330_3; +; SM90-NEXT: @%p1 bra $L__BB413_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB413_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB330_1; -; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB413_1; +; SM90-NEXT: $L__BB413_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15289,20 +19116,20 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB414_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB331_3; +; SM90-NEXT: 
@%p1 bra $L__BB414_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB414_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB331_1; -; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB414_1; +; SM90-NEXT: $L__BB414_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15310,6 +19137,52 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB415_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra 
$L__BB415_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB415_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB415_1; +; SM90-NEXT: $L__BB415_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + ret i16 %new +} + define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_monotonic_i16_shared_sys( ; SM90: { @@ -15335,20 +19208,20 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB416_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB332_3; +; SM90-NEXT: @%p1 bra $L__BB416_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB416_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB332_1; -; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB416_1; +; SM90-NEXT: $L__BB416_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15381,20 +19254,20 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: 
ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB417_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB333_3; +; SM90-NEXT: @%p1 bra $L__BB417_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB417_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB333_1; -; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB417_1; +; SM90-NEXT: $L__BB417_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15427,20 +19300,20 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB418_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB334_3; +; SM90-NEXT: @%p1 bra $L__BB418_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB418_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB334_1; -; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end 
+; SM90-NEXT: @%p2 bra $L__BB418_1; +; SM90-NEXT: $L__BB418_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15473,20 +19346,20 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB419_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB335_3; +; SM90-NEXT: @%p1 bra $L__BB419_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB419_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB335_1; -; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB419_1; +; SM90-NEXT: $L__BB419_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15494,6 +19367,52 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, 
[seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB420_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB420_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB420_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB420_1; +; SM90-NEXT: $L__BB420_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_acquire_i16_generic_sys( ; SM90: { @@ -15519,20 +19438,20 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB421_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra 
$L__BB336_3; +; SM90-NEXT: @%p1 bra $L__BB421_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB421_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB336_1; -; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB421_1; +; SM90-NEXT: $L__BB421_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15565,20 +19484,20 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB422_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB337_3; +; SM90-NEXT: @%p1 bra $L__BB422_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB422_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB337_1; -; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB422_1; +; SM90-NEXT: $L__BB422_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15595,10 +19514,56 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; -; 
SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB423_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB423_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB423_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB423_1; +; SM90-NEXT: $L__BB423_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + ret i16 %new +} + +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 
%rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15611,29 +19576,29 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB424_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB338_3; +; SM90-NEXT: @%p1 bra $L__BB424_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB424_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB338_1; -; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: @%p2 bra $L__BB424_1; +; SM90-NEXT: $L__BB424_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( +define 
i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -15641,10 +19606,10 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15655,26 +19620,26 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB425_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB339_3; +; SM90-NEXT: @%p1 bra $L__BB425_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB425_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, 
%r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB339_1; -; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB425_1; +; SM90-NEXT: $L__BB425_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire ret i16 %new } @@ -15703,20 +19668,20 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB426_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB340_3; +; SM90-NEXT: @%p1 bra $L__BB426_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB426_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB340_1; -; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB426_1; +; SM90-NEXT: $L__BB426_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15749,20 +19714,20 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB427_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB341_3; +; SM90-NEXT: @%p1 bra $L__BB427_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB427_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB341_1; -; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB427_1; +; SM90-NEXT: $L__BB427_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15795,20 +19760,20 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB428_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB342_3; +; SM90-NEXT: @%p1 bra $L__BB428_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB428_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB342_1; -; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB428_1; +; SM90-NEXT: $L__BB428_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 
[func_retval0], %r14; ; SM90-NEXT: ret; @@ -15841,20 +19806,20 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB429_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB343_3; +; SM90-NEXT: @%p1 bra $L__BB429_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB429_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB343_1; -; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB429_1; +; SM90-NEXT: $L__BB429_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15862,6 +19827,52 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; 
SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB430_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB430_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB430_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB430_1; +; SM90-NEXT: $L__BB430_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire + ret i16 %new +} + define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_acquire_i16_shared_sys( ; SM90: { @@ -15887,20 +19898,20 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB431_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB344_3; +; SM90-NEXT: @%p1 bra $L__BB431_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure 
-; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB431_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB344_1; -; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB431_1; +; SM90-NEXT: $L__BB431_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15933,20 +19944,20 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB432_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB345_3; +; SM90-NEXT: @%p1 bra $L__BB432_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB432_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB345_1; -; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB432_1; +; SM90-NEXT: $L__BB432_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15979,20 +19990,20 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB433_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB346_3; +; SM90-NEXT: @%p1 bra $L__BB433_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB433_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB346_1; -; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB433_1; +; SM90-NEXT: $L__BB433_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16025,20 +20036,20 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB434_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB347_3; +; SM90-NEXT: @%p1 bra $L__BB434_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB434_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB347_1; -; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB434_1; +; SM90-NEXT: $L__BB434_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ 
-16046,6 +20057,52 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_generic( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB435_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB435_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB435_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB435_1; +; SM90-NEXT: $L__BB435_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; 
SM90-LABEL: seq_cst_seq_cst_i16_generic_sys( ; SM90: { @@ -16071,20 +20128,20 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB436_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB348_3; +; SM90-NEXT: @%p1 bra $L__BB436_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB436_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB348_1; -; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB436_1; +; SM90-NEXT: $L__BB436_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16117,20 +20174,20 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB437_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB349_3; +; SM90-NEXT: @%p1 bra $L__BB437_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB437_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB349_1; -; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB437_1; +; SM90-NEXT: $L__BB437_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16163,20 +20220,20 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB438_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB350_3; +; SM90-NEXT: @%p1 bra $L__BB438_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB438_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB350_1; -; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB438_1; +; SM90-NEXT: $L__BB438_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16209,20 +20266,20 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB439_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 
%r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB351_3; +; SM90-NEXT: @%p1 bra $L__BB439_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB439_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB351_1; -; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB439_1; +; SM90-NEXT: $L__BB439_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16230,6 +20287,52 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB440_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB440_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB440_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB440_1; +; SM90-NEXT: $L__BB440_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_seq_cst_i16_global_sys( ; SM90: { @@ -16255,20 +20358,20 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB441_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB352_3; +; SM90-NEXT: @%p1 bra $L__BB441_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB441_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB352_1; -; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB441_1; +; SM90-NEXT: $L__BB441_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16301,20 +20404,20 @@ define i16 
@seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB442_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB353_3; +; SM90-NEXT: @%p1 bra $L__BB442_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB442_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB353_1; -; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB442_1; +; SM90-NEXT: $L__BB442_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16347,20 +20450,20 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB443_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB354_3; +; SM90-NEXT: @%p1 bra $L__BB443_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB443_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; 
SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB354_1; -; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB443_1; +; SM90-NEXT: $L__BB443_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16393,24 +20496,70 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB444_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB444_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB444_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB444_1; +; SM90-NEXT: $L__BB444_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + ret i16 %new +} + +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_shared( +; SM90: { +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: $L__BB445_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB355_3; +; SM90-NEXT: @%p1 bra $L__BB445_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB445_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB355_1; -; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: @%p2 bra $L__BB445_1; +; SM90-NEXT: $L__BB445_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } @@ -16439,20 +20588,20 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB446_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB356_3; +; SM90-NEXT: @%p1 bra $L__BB446_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB446_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB356_1; -; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB446_1; +; SM90-NEXT: $L__BB446_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16485,20 +20634,20 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB447_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB357_3; +; SM90-NEXT: @%p1 bra $L__BB447_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB447_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB357_1; -; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB447_1; +; SM90-NEXT: $L__BB447_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16531,20 +20680,20 @@ define 
i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB448_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB358_3; +; SM90-NEXT: @%p1 bra $L__BB448_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB448_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB358_1; -; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB448_1; +; SM90-NEXT: $L__BB448_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16577,20 +20726,20 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB449_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB359_3; +; SM90-NEXT: @%p1 bra $L__BB449_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB449_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, 
%r8; ; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB359_1; -; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM90-NEXT: @%p2 bra $L__BB449_1; +; SM90-NEXT: $L__BB449_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16598,6 +20747,23 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_monotonic_i32_generic_sys( ; SM90: { @@ -16666,6 +20832,23 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold 
= cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_monotonic_i32_global_sys( ; SM90: { @@ -16734,6 +20917,23 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ret i32 %new } +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_monotonic_i32_shared_sys( ; SM90: { @@ -16802,6 +21002,23 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ret i32 %new } +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic 
acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_acquire_i32_generic_sys( ; SM90: { @@ -16870,6 +21087,23 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_acquire_i32_global_sys( ; SM90: { @@ -16938,6 +21172,23 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 
@monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_acquire_i32_shared_sys( ; SM90: { @@ -17006,6 +21257,24 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_seq_cst_i32_generic_sys( ; SM90: { @@ -17078,6 +21347,24 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_global_sys(ptr 
addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_seq_cst_i32_global_sys( ; SM90: { @@ -17150,6 +21437,24 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_seq_cst_i32_shared_sys( ; SM90: { @@ -17222,6 +21527,23 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: 
acquire_monotonic_i32_generic_sys( ; SM90: { @@ -17290,6 +21612,23 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_monotonic_i32_global_sys( ; SM90: { @@ -17358,6 +21697,23 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_monotonic_i32_shared_sys( ; SM90: { @@ -17426,6 
+21782,23 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_acquire_i32_generic_sys( ; SM90: { @@ -17494,6 +21867,23 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_acquire_i32_global_sys( ; SM90: { @@ -17562,6 +21952,23 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 
@acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_acquire_i32_shared_sys( ; SM90: { @@ -17630,6 +22037,24 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_seq_cst_i32_generic_sys( ; SM90: { @@ -17702,6 +22127,24 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: 
acquire_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_seq_cst_i32_global_sys( ; SM90: { @@ -17774,6 +22217,24 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_seq_cst_i32_shared_sys( ; SM90: { @@ -17846,6 +22307,23 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_generic( +; SM90: { 
+; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_monotonic_i32_generic_sys( ; SM90: { @@ -17910,7 +22388,24 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + ret i32 %new +} + +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic ret i32 %new } @@ -17982,6 +22477,23 @@ define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, 
i32 %new) { +; SM90-LABEL: release_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_monotonic_i32_shared_sys( ; SM90: { @@ -18050,6 +22562,23 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_acquire_i32_generic_sys( ; SM90: { @@ -18118,6 +22647,23 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg 
.b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_acquire_i32_global_sys( ; SM90: { @@ -18186,6 +22732,23 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_acquire_i32_shared_sys( ; SM90: { @@ -18254,6 +22817,24 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_seq_cst_i32_generic_sys( ; SM90: { @@ -18326,6 +22907,24 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_seq_cst_i32_global_sys( ; SM90: { @@ -18398,6 +22997,24 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; 
+; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_seq_cst_i32_shared_sys( ; SM90: { @@ -18470,6 +23087,23 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_monotonic_i32_generic_sys( ; SM90: { @@ -18538,6 +23172,23 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_monotonic_i32_global_sys( ; SM90: { @@ -18606,6 +23257,23 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_monotonic_i32_shared_sys( ; SM90: { @@ -18674,6 +23342,23 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM90-NEXT: 
atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_acquire_i32_generic_sys( ; SM90: { @@ -18742,6 +23427,23 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_acquire_i32_global_sys( ; SM90: { @@ -18810,6 +23512,23 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_acquire_i32_shared_sys( ; SM90: { @@ -18878,6 +23597,24 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys( ; SM90: { @@ -18950,6 +23687,24 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel 
seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_seq_cst_i32_global_sys( ; SM90: { @@ -19022,6 +23777,24 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys( ; SM90: { @@ -19094,6 +23867,24 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 
@seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_monotonic_i32_generic_sys( ; SM90: { @@ -19166,6 +23957,24 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_monotonic_i32_global_sys( ; SM90: { @@ -19238,6 +24047,24 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 
@seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_monotonic_i32_shared_sys( ; SM90: { @@ -19310,6 +24137,24 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_acquire_i32_generic_sys( ; SM90: { @@ -19382,6 +24227,24 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 
%new) { ; SM90-LABEL: seq_cst_acquire_i32_global_sys( ; SM90: { @@ -19450,7 +24313,25 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire + ret i32 %new +} + +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire ret i32 %new } @@ -19526,6 +24407,24 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_generic( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, 
i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_seq_cst_i32_generic_sys( ; SM90: { @@ -19598,6 +24497,24 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_seq_cst_i32_global_sys( ; SM90: { @@ -19670,6 +24587,24 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_shared( +; SM90: { +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + 
define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys( ; SM90: { @@ -19742,6 +24677,22 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_monotonic_i64_generic_sys( ; SM90: { @@ -19806,6 +24757,22 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: 
monotonic_monotonic_i64_global_sys( ; SM90: { @@ -19870,6 +24837,22 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ret i64 %new } +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_monotonic_i64_shared_sys( ; SM90: { @@ -19934,6 +24917,22 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ret i64 %new } +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_acquire_i64_generic_sys( ; SM90: { @@ -19998,6 +24997,22 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, 
i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_acquire_i64_global_sys( ; SM90: { @@ -20062,6 +25077,22 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_acquire_i64_shared_sys( ; SM90: { @@ -20126,6 +25157,23 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 
%new) { +; SM90-LABEL: monotonic_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_seq_cst_i64_generic_sys( ; SM90: { @@ -20194,6 +25242,23 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_seq_cst_i64_global_sys( ; SM90: { @@ -20262,6 +25327,23 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: 
.reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_seq_cst_i64_shared_sys( ; SM90: { @@ -20330,6 +25412,22 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_monotonic_i64_generic_sys( ; SM90: { @@ -20394,6 +25492,22 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_monotonic_i64_global_sys( ; SM90: { @@ -20458,6 +25572,22 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_monotonic_i64_shared_sys( ; SM90: { @@ -20522,6 +25652,22 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; 
SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_acquire_i64_generic_sys( ; SM90: { @@ -20586,6 +25732,22 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_acquire_i64_global_sys( ; SM90: { @@ -20650,6 +25812,22 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_acquire_i64_shared_sys( ; SM90: { @@ -20714,6 +25892,23 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_seq_cst_i64_generic_sys( ; SM90: { @@ -20782,6 +25977,23 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret 
i64 %new +} + define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_seq_cst_i64_global_sys( ; SM90: { @@ -20850,6 +26062,23 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_seq_cst_i64_shared_sys( ; SM90: { @@ -20918,6 +26147,22 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: 
release_monotonic_i64_generic_sys( ; SM90: { @@ -20982,6 +26227,22 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_monotonic_i64_global_sys( ; SM90: { @@ -21042,7 +26303,23 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic + ret i64 %new +} + +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic ret i64 %new } @@ -21110,6 +26387,22 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_acquire_i64_generic_sys( ; SM90: { @@ -21174,6 +26467,22 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_acquire_i64_global_sys( ; SM90: { @@ -21238,6 +26547,22 @@ define i64 
@release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_acquire_i64_shared_sys( ; SM90: { @@ -21302,6 +26627,23 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_seq_cst_i64_generic_sys( ; SM90: { @@ -21370,6 +26712,23 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 
%cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_seq_cst_i64_global_sys( ; SM90: { @@ -21438,6 +26797,23 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_seq_cst_i64_shared_sys( ; SM90: { @@ -21506,6 +26882,22 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg 
.b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM90: { @@ -21570,6 +26962,22 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_monotonic_i64_global_sys( ; SM90: { @@ -21634,6 +27042,22 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; 
SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM90: { @@ -21698,6 +27122,22 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_acquire_i64_generic_sys( ; SM90: { @@ -21762,6 +27202,22 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM90-NEXT: 
atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_acquire_i64_global_sys( ; SM90: { @@ -21826,6 +27282,22 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_acquire_i64_shared_sys( ; SM90: { @@ -21890,6 +27362,23 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; 
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM90: { @@ -21958,6 +27447,23 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM90: { @@ -22026,6 +27532,23 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 
@acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM90: { @@ -22094,6 +27617,23 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM90: { @@ -22162,6 +27702,23 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: 
seq_cst_monotonic_i64_global_sys( ; SM90: { @@ -22230,6 +27787,23 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } +define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM90: { @@ -22298,6 +27872,23 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_acquire_i64_generic_sys( ; SM90: { @@ -22366,6 +27957,23 @@ define i64 
@seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_acquire_i64_global_sys( ; SM90: { @@ -22434,6 +28042,23 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_acquire_i64_shared_sys( ; SM90: { @@ -22502,6 +28127,23 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } 
+define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_generic( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM90: { @@ -22570,6 +28212,23 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM90: { @@ -22638,6 +28297,23 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: 
seq_cst_seq_cst_i64_shared( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM90: { diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index 277704bd9d5a5..263627fea8a50 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -12,6 +12,14 @@ """ ) +cmpxchg_func_no_scope = Template( + """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure + ret i$size %new +} +""" +) + run_statement = Template( """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %} @@ -38,25 +46,38 @@ for sm, ptx in TESTS: with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) - for size, success, failure, addrspace, llvm_scope in product( - SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES, LLVM_SCOPES + for size, success, failure, addrspace in product( + SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES ): - # cluster ordering is supported from SM90 onwards - if sm != 90 and llvm_scope == "cluster": - continue 
if addrspace == 0: addrspace_cast = "" else: addrspace_cast = " addrspace({})".format(str(addrspace)) + # Test default scope print( - cmpxchg_func.substitute( + cmpxchg_func_no_scope.substitute( success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast, - llvm_scope=llvm_scope, - ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], ), file=fp, ) + + for llvm_scope in LLVM_SCOPES: + # cluster ordering is supported from SM90 onwards + if sm < 90 and llvm_scope == "cluster": + continue + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=addrspace_cast, + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], + ), + file=fp, + ) From 188252516a9dfe2cdc39eccb1e406f147d795e0e Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 9 May 2025 01:00:43 +0000 Subject: [PATCH 07/26] [NVPTX] Add syncscope support for cmpxchg --- llvm/include/llvm/CodeGen/TargetLowering.h | 16 +- llvm/lib/CodeGen/AtomicExpandPass.cpp | 15 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 10 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +- llvm/lib/Target/ARM/ARMISelLowering.h | 10 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelLowering.h | 11 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 12 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 13600 +++++----------- llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 13700 +++++----------- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 15064 ++++++------------ llvm/test/CodeGen/NVPTX/cmpxchg.ll | 640 +- llvm/test/CodeGen/NVPTX/cmpxchg.py | 9 +- 15 files changed, 13062 insertions(+), 30049 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 9c3cede359c15..d11e2ca22b189 100644 --- 
a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2325,13 +2325,15 @@ class LLVM_ABI TargetLoweringBase { /// standard ABI uses a fence before a seq_cst load instead of after a /// seq_cst store). /// @{ - virtual Instruction *emitLeadingFence(IRBuilderBase &Builder, - Instruction *Inst, - AtomicOrdering Ord) const; - - virtual Instruction *emitTrailingFence(IRBuilderBase &Builder, - Instruction *Inst, - AtomicOrdering Ord) const; + virtual Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const; + + virtual Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const; /// @} // Emits code that executes when the comparison result in the ll/sc diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 3f3d5dc90711f..bc400b28d26af 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -314,6 +314,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; + SyncScope::ID SSID = SyncScope::System; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); @@ -336,13 +337,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { // expandAtomicCmpXchg in that case. FenceOrdering = CASI->getMergedOrdering(); auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI); + SSID = CASI->getSyncScopeID(); CASI->setSuccessOrdering(CASOrdering); CASI->setFailureOrdering(CASOrdering); + // If CAS ordering is monotonic, then the operation will + // take default scope. 
Otherwise, it will retain its scope + if (CASOrdering != AtomicOrdering::Monotonic) + CASI->setSyncScopeID(SSID); } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering); + MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID); } } else if (I->hasAtomicStore() && TLI->shouldInsertTrailingFenceForAtomicStore(I)) { @@ -443,12 +449,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F, } bool AtomicExpandImpl::bracketInstWithFences(Instruction *I, - AtomicOrdering Order) { + AtomicOrdering Order, + SyncScope::ID SSID) { ReplacementIRBuilder Builder(I, *DL); - auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); + auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID); - auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID); // We have a guard here because not every atomic operation generates a // trailing fence. if (TrailingFence) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 0a077b7b61437..6c4a480b5ca87 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2383,18 +2383,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) - return Builder.CreateFence(Ord); + return Builder.CreateFence(Ord, SSID); else return nullptr; } Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (isAcquireOrStronger(Ord)) - return Builder.CreateFence(Ord); + return Builder.CreateFence(Ord, SSID); else return nullptr; } diff --git 
a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 05ca11cfac5cb..1a409c3165f49 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21190,7 +21190,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -21215,7 +21216,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 604910e04d4cc..79926386cde1e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -674,10 +674,12 @@ class VectorType; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction *emitLeadingFence( + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction *emitTrailingFence( + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; unsigned getMaxSupportedInterleaveFactor() const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index f038483aa4298..2fe4adfdaacb4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6266,7 +6266,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (!isa<AtomicCmpXchgInst>(Inst)) - return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); + return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord, SSID); @@ -6284,7 +6285,8 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { // Specialize for cmpxchg if (!isa<AtomicCmpXchgInst>(Inst)) - return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord, SSID); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index b96505816dee8..7e2bd684a3e06 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12820,7 +12820,8 @@ Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder, // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -12830,7 +12831,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { // See
http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 4c88bd372b106..8f1793ac1136f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -936,7 +936,11 @@ namespace llvm { - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; bool shouldInlineQuadwordAtomics() const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 35fbac04b3405..a08b4aac24e06 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23306,7 +23306,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint( Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Subtarget.hasStdExtZtso()) { if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); @@ -23322,7 +23323,8 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering
Ord) const { + AtomicOrdering Ord, + SyncScope::ID SSID) const { if (Subtarget.hasStdExtZtso()) { if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a1b283e35074a..05ea2e5759f80 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -232,10 +232,14 @@ class RISCVTargetLowering : public TargetLowering { // than this hook due to limitations in the interface here. bool shouldInsertFencesForAtomic(const Instruction *I) const override; - Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction * + emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; + Instruction * + emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord, + SyncScope::ID SSID = SyncScope::System) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 11f79acb6060e..40e365c886c42 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ;
SM60-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) 
{ ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -178,12 +178,12 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -203,9 +203,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -223,12 +223,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -248,9 +248,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -268,12 +268,12 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -313,12 +313,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -358,12 +358,12 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -383,9 +383,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -403,12 +403,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -416,8 +416,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; 
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -446,14 +446,15 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -461,8 +462,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -473,9 +474,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -491,14 +492,15 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -506,8 +508,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -518,9 +520,9 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -536,14 +538,15 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -551,8 +554,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: 
.reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -563,9 +566,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -584,12 +587,12 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -597,8 +600,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -609,9 +612,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -627,15 +630,15 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -643,8 +646,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -655,9 +658,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -673,15 +676,15 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -689,8 +692,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -701,9 +704,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -719,15 +722,15 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -735,8 +738,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 
%new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -747,9 +750,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -765,15 +768,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_sys( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -781,8 +784,8 @@ define i8 
@monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -793,9 +796,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -811,15 +814,15 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_cta( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: 
.reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -827,8 +830,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -839,9 +843,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -857,15 +861,15 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_gpu( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 
%cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -873,8 +877,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -885,9 +890,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -903,15 +908,15 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
monotonic_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -919,8 +924,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -931,9 +937,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -949,15 +955,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 
@monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -965,8 +971,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -977,9 +984,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -998,12 +1005,12 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 
@monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1011,8 +1018,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1023,9 +1031,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1044,12 +1052,12 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 
@monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1057,8 +1065,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1069,9 +1078,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1090,12 +1099,12 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 
@monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1103,8 +1112,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1116,9 +1125,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1137,12 +1146,12 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( +define i8 
@monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1150,9 +1159,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1163,9 +1172,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1181,15 +1190,15 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 
@monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1197,9 +1206,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1210,9 +1219,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1228,15 +1237,15 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1244,9 +1253,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1257,9 +1265,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1275,15 +1283,15 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, 
i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1291,9 +1299,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1304,9 +1311,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1322,15 +1329,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 
[func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_sys( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1338,9 +1345,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1351,9 +1357,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1369,15 +1375,15 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_cta( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1385,9 +1391,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1398,9 +1403,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1416,15 +1421,15 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) 
%addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB30_1; ; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1432,9 +1437,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1445,9 +1449,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 @@ -1463,15 +1467,15 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1479,9 +1483,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1492,9 +1495,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, 
[%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1510,15 +1513,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB32_1; ; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1526,9 +1529,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1539,9 +1541,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; 
SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1560,12 +1562,12 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1573,9 +1575,8 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1586,9 +1587,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1607,12 +1608,12 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1620,9 +1621,8 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1633,9 +1633,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1654,12 +1654,12 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1667,8 +1667,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1679,9 +1679,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; 
SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1700,12 +1700,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_sys( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1713,8 +1713,8 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1725,9 +1725,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1743,15 +1743,15 @@ define i8 
@acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB37_1; ; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_cta( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1759,8 +1759,8 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1771,9 +1771,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1789,15 +1789,15 @@ 
define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB38_1; ; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_gpu( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1805,8 +1805,8 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1817,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -1835,15 +1835,15 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB39_1; ; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1851,8 +1851,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1863,9 +1863,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB40_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1881,15 +1881,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB40_1; ; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_sys( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1897,8 +1897,8 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1909,9 +1909,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, 
[%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1927,15 +1927,15 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB41_1; ; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_cta( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1943,8 +1943,8 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1973,15 +1973,15 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB42_1; ; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_gpu( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1989,8 +1989,8 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2001,9 +2001,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: 
ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2019,15 +2019,15 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2035,8 +2035,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2047,9 +2047,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2065,15 +2065,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB44_1; ; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_sys( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2081,8 +2081,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2093,9 +2094,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2114,12 +2115,12 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_cta( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2127,8 +2128,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2139,9 +2141,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2160,12 +2162,12 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_gpu( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2173,8 +2175,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2185,9 +2188,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, 
[acquire_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2206,12 +2209,12 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2219,8 +2222,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2231,9 +2235,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, 
[acquire_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2252,12 +2256,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_sys( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2265,8 +2269,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2277,9 +2282,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; 
SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2295,15 +2300,15 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_cta( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2311,8 +2316,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2323,9 +2329,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2341,15 +2347,15 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2357,8 +2363,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2369,9 +2376,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, 
[acquire_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2387,15 +2394,15 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2403,8 +2410,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2415,9 +2423,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, 
[acquire_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2433,15 +2441,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_sys( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2449,8 +2457,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2461,9 +2470,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, 
%r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2479,15 +2488,15 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_cta( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2495,8 +2504,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2507,9 +2517,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2525,15 +2535,14 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_gpu( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2541,8 +2550,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2553,9 
+2563,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2571,15 +2581,14 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB55_1; ; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2587,8 +2596,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2599,9 +2609,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2617,15 +2627,14 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_sys( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2633,8 +2642,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2645,9 +2655,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2663,15 +2673,14 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_cta( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2679,8 +2688,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2691,9 +2701,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2709,15 +2719,14 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_gpu( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2725,8 +2734,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: 
ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2737,9 +2747,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2755,15 +2765,14 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2771,8 +2780,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2784,9 +2793,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2802,15 +2811,14 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_sys( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2818,9 +2826,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2831,9 +2839,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2849,15 +2857,14 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_cta( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2865,9 +2872,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[acquire_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2878,9 +2885,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2896,15 +2903,14 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2912,9 +2918,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2925,9 +2931,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2943,15 +2949,15 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2959,9 +2965,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 
%new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2972,9 +2978,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2990,15 +2996,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_sys( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3006,9 +3012,9 @@ 
define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3019,9 +3025,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3037,15 +3043,15 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_cta( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
release_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3053,9 +3059,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3066,9 +3072,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3084,15 +3090,15 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
acquire_seq_cst_i8_global_gpu( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3100,9 +3106,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3113,9 +3119,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3131,15 +3137,15 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") 
release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3147,9 +3153,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3160,9 +3166,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3178,15 +3184,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire 
seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_sys( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3194,8 +3200,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3207,9 +3213,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3228,12 +3234,12 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_cta( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3241,8 +3247,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3254,9 +3260,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3275,12 +3281,12 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 
@acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3288,8 +3294,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3301,9 +3307,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3322,12 +3328,12 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
release_monotonic_i8_generic( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3335,8 +3341,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3348,9 +3354,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3366,14 +3372,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_sys( +define i8 
@release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3381,9 +3388,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3394,9 +3401,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3412,14 +3419,15 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
release_monotonic_i8_generic_cta( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3427,9 +3435,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3440,9 +3448,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3458,14 +3466,15 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_generic_gpu(ptr 
%addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_gpu( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3473,9 +3482,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3486,9 +3495,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3504,14 +3513,15 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release 
seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3519,9 +3529,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3532,9 +3542,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3550,14 +3560,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_sys( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3565,9 +3576,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3578,9 +3589,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3596,14 +3607,15 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_cta( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3611,9 +3623,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3624,9 +3636,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3642,14 +3654,15 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: 
// %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_gpu( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3657,9 +3670,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3670,9 +3683,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3688,14 +3701,15 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) 
%addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3703,9 +3717,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3716,9 +3730,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop 
Header: Depth=1 @@ -3734,14 +3748,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_sys( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3749,8 +3764,8 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3762,9 +3777,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3780,14 +3795,15 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3795,8 +3811,8 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3808,9 +3824,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: 
// %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3826,14 +3842,15 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_gpu( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3841,8 +3858,8 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3854,9 +3871,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, 
%r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3872,14 +3889,15 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3887,8 +3905,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3900,9 +3918,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; 
SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3921,12 +3939,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3934,9 +3952,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3947,9 +3965,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner 
Loop Header: Depth=1 @@ -3965,15 +3983,15 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_cta( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3981,9 +3999,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3994,9 +4012,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, 
%r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4012,15 +4030,15 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4028,9 +4046,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4041,9 +4059,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 
%r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4059,15 +4077,15 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4075,9 +4093,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4088,9 +4106,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, 
[acq_rel_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4106,15 +4124,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB88_1; ; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_sys( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4122,9 +4140,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4135,9 +4153,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 
%r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4153,15 +4171,15 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB89_1; ; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_cta( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4169,9 +4187,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4182,33 +4200,33 @@ define i8 @release_acquire_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB90_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB90_1; ; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_gpu( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4216,9 +4234,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[release_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4229,33 +4247,33 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB91_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB91_1; ; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) 
%addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4263,9 +4281,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4276,33 +4294,33 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB92_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; 
SM60-NEXT: @%p2 bra $L__BB92_1; ; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_sys( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4310,8 +4328,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4323,33 +4341,33 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB93_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB93_1; ; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_cta( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4357,8 +4375,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4370,33 +4388,33 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: 
ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB94_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB94_1; ; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_gpu( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4404,8 +4422,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[acq_rel_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4417,33 +4435,33 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB95_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB95_1; ; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic( +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4451,8 +4469,8 @@ define 
i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4464,33 +4482,33 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB96_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB96_1; ; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 
@release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_sys( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4498,9 +4516,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4511,33 +4529,33 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB97_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 
%r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB97_1; ; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_cta( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4545,9 +4563,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4558,33 +4576,33 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB98_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB98_1; ; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_gpu( +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4592,9 +4610,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4605,9 +4623,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 
255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4620,18 +4638,18 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB99_1; ; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global( +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4639,9 +4657,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4652,33 +4670,33 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB100_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB100_1; ; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_sys( +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4686,9 +4704,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4699,33 +4717,33 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB101_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB101_1; ; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 
%new } -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_cta( +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4733,9 +4751,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4746,9 +4764,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4761,18 +4779,18 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB102_1; ; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; 
+; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_gpu( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4780,9 +4798,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4793,9 +4811,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4808,18 +4826,18 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // 
in Loop: Header=BB103_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB103_1; ; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4827,9 +4845,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4840,33 +4858,33 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 
%r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB104_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB104_1; ; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_sys( +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4874,8 +4892,8 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4887,9 +4905,9 @@ define i8 
@release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4902,18 +4920,18 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB105_1; ; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4921,8 +4939,8 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; 
SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4934,9 +4952,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4949,18 +4967,18 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB106_1; ; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4968,8 +4986,8 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4981,9 +4999,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4996,18 +5014,18 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB107_1; ; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic( +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ 
-5015,8 +5033,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5028,9 +5046,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5043,18 +5061,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB108_1; ; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
seq_cst_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5062,9 +5080,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5075,9 +5093,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5090,18 +5108,18 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB109_1; ; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst 
monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5109,9 +5127,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5122,9 +5140,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5137,18 +5155,18 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB110_1; ; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: 
membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5156,9 +5174,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5169,33 +5187,33 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: 
atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB111_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB111_1; ; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5203,9 +5221,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5216,9 +5234,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, 
[acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5231,18 +5249,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB112_1; ; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_sys( +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5250,9 +5268,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 
%r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5263,9 +5281,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB113_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5278,18 +5296,18 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB113_1; ; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_cta( +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5297,9 +5315,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 
%rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5310,33 +5328,33 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB114_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB114_1; ; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { 
-; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5344,9 +5362,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5357,33 +5375,33 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB115_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; 
SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB115_1; ; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared( +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5391,9 +5409,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5404,9 +5422,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop ; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 @@ -5419,18 +5437,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB116_1; ; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5438,8 +5456,8 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5451,33 +5469,33 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; ; 
SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB117_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB117_1; ; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5485,8 +5503,8 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: 
cvt.u32.u64 %r9, %rd2; @@ -5498,33 +5516,33 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB118_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB118_1; ; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5532,8 +5550,8 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5545,33 +5563,33 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB119_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB119_1; ; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; 
SM60-LABEL: acq_rel_acquire_i8_generic( +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5579,8 +5597,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5592,33 +5610,33 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB120_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB120_1; ; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM60-NEXT: 
membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_sys( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5626,9 +5644,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5639,33 +5657,33 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB121_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB121_1; ; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_cta( +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5673,9 +5691,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5686,33 +5704,33 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, 
[seq_cst_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB122_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB122_1; ; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5720,9 +5738,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[seq_cst_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5733,33 +5751,33 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB123_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB123_1; ; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global( +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg 
.b16 %rs<2>; @@ -5767,9 +5785,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5780,33 +5798,33 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB124_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB124_1; ; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_sys( +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5814,9 +5832,9 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5827,33 +5845,33 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 
bra $L__BB125_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB125_1; ; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_cta( +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5861,9 +5879,9 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5874,33 +5892,33 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB126_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB126_1; ; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_gpu( +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5908,9 +5926,9 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5921,33 +5939,33 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB127_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB127_1; ; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5955,9 +5973,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5968,33 +5986,33 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB128_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB128_1; ; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 
%new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_sys( +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6002,8 +6020,8 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6015,33 +6033,33 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB129_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: 
setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB129_1; ; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_cta( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6049,8 +6067,8 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6062,33 +6080,33 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB130_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB130_1; ; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6096,8 +6114,8 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6109,33 +6127,33 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB131_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB131_1; ; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6143,8 +6161,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.u8 %rs1, 
[seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6156,33 +6174,33 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB132_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB132_1; ; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ 
-6190,9 +6208,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -6203,33 +6221,33 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB133_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB133_1; ; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") 
acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6237,9 +6255,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -6250,4038 +6268,209 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB134_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM60-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: mov.u32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB134_1; ; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, 
%r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB135_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB135_1; ; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global( +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB136_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: 
setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB136_1; ; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB137_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.u32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB137_1; ; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new } -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; 
; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB138_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 
; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB138_1; -; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB139_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 
bra $L__BB139_1; -; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB140_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB140_1; -; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB141_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB141_1; -; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 
@acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB142_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB142_1; -; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( -; 
SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB143_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB143_1; -; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; 
SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB144_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB144_1; -; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB145_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB145_1; -; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; 
SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB146_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB146_1; -; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; 
SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB147_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB147_1; -; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB148_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB148_1; -; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, 
[%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB149_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB149_1; -; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop -; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB150_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB150_1; -; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; 
SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB151_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB151_1; -; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB152_3; -; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB152_1; -; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB153_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; 
-; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB153_1; -; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB154_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB154_1; -; 
SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB155_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB155_1; -; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], 
%r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB156_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB156_1; -; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 
%cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB157_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB157_1; -; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 
%r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB158_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB158_1; -; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; 
SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB159_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB159_1; -; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB160_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB160_1; -; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; 
-; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB161_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB161_1; -; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 
%r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB162_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB162_1; -; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB163_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB163_1; -; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop -; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB164_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB164_1; -; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 
%r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB165_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB165_1; -; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB166_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB166_1; -; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB167_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: 
setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB167_1; -; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB168_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB168_1; -; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM60-NEXT: 
membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB169_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB169_1; -; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - 
-define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB170_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB170_1; -; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; 
-; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB171_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB171_1; -; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 
%rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB172_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB172_1; -; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB173_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB173_1; -; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 
%r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB174_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB174_1; -; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; 
SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB175_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB175_1; -; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, 
[seq_cst_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB176_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB176_1; -; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, 
%r16, %r2; -; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB177_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB177_1; -; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: 
or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB178_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB178_1; -; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB179_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB179_1; -; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB180_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 -; SM60-NEXT: and.b32 %r8, 
%r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB180_1; -; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB181_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB181_1; -; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB182_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB182_1; -; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; 
SM60-LABEL: monotonic_monotonic_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB183_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB183_1; -; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB184_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB184_1; -; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB185_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB185_1; -; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 
%r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB186_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB186_1; -; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 
%r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB187_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB187_1; -; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop -; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB188_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB188_1; -; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB189_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB189_1; -; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB190_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 -; SM60-NEXT: 
and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB190_1; -; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB191_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB191_1; -; SM60-NEXT: $L__BB191_3: // 
%partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB192_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB192_1; -; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 
@monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB193_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB193_1; -; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: 
.reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB194_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB194_1; -; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB195_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB195_1; -; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: 
and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB196_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB196_1; -; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB197_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB197_1; -; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, 
%r15, %r2; -; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB198_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB198_1; -; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: 
or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB199_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB199_1; -; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB200_3; -; SM60-NEXT: // %bb.2: 
// %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB200_1; -; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB201_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, 
%r8; -; SM60-NEXT: @%p2 bra $L__BB201_1; -; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB202_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB202_1; -; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB203_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB203_1; -; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 
@monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB204_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB204_1; -; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg 
.b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB205_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB205_1; -; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: 
ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB206_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB206_1; -; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; -; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB207_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB207_1; -; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB208_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB208_1; -; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB209_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB209_1; -; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB210_1: // 
%partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB210_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB210_1; -; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, 
%r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB211_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB211_1; -; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB212_3; -; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB212_1; -; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB213_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: 
setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB213_1; -; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB214_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB214_1; -; SM60-NEXT: $L__BB214_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB215_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB215_1; -; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB216_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB216_1; -; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_sys( -; SM60: { -; 
SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB217_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB217_1; -; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acquire_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB218_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB218_1; -; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, 
[acquire_monotonic_i16_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB219_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB219_1; -; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 
%r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB220_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB220_1; -; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: 
ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB221_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB221_1; -; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB138_1; +; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_cta( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10289,10 +6478,10 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10302,31 +6491,30 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB222_3; +; SM60-NEXT: @%p1 bra $L__BB139_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB222_1; -; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB139_1; +; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_gpu( +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 
%cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10334,10 +6522,10 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10347,76 +6535,30 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB223_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB223_1; -; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], 
%r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: @%p1 bra $L__BB140_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB224_1; -; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB140_1; +; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 
[func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_sys( +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10424,10 +6566,10 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10437,31 +6579,30 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], 
%r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB225_3; +; SM60-NEXT: @%p1 bra $L__BB141_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB225_1; -; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB141_1; +; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_cta( +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10469,10 +6610,10 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 
3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10482,31 +6623,30 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB226_3; +; SM60-NEXT: @%p1 bra $L__BB142_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB226_1; -; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB142_1; +; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_gpu( +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10514,10 +6654,10 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr 
addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10527,76 +6667,30 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB227_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB227_1; -; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 
%new) { -; SM60-LABEL: acquire_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB228_3; +; SM60-NEXT: @%p1 bra $L__BB143_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB228_1; -; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB143_1; +; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 
@acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_sys( +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10604,10 +6698,10 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10617,76 +6711,31 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB229_3; +; SM60-NEXT: @%p1 bra $L__BB144_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; 
SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB229_1; -; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB144_1; +; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB230_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB230_1; -; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_gpu( +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10694,10 +6743,10 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10707,31 +6756,31 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; 
SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB231_3; +; SM60-NEXT: @%p1 bra $L__BB145_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB231_1; -; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB145_1; +; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global( +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10739,10 +6788,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; 
SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10752,31 +6801,31 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB232_3; +; SM60-NEXT: @%p1 bra $L__BB146_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB232_1; -; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB146_1; +; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_sys( +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10784,10 +6833,10 @@ define i16 
@acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10797,31 +6846,31 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB233_3; +; SM60-NEXT: @%p1 bra $L__BB147_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB233_1; -; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB147_1; +; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 
[func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_cta( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10829,10 +6878,10 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10842,31 +6891,31 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB234_3; +; SM60-NEXT: @%p1 bra $L__BB148_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB234_1; -; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB148_1; +; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_gpu( +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10874,10 +6923,10 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, 
%r11, 3; @@ -10887,31 +6936,31 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB235_3; +; SM60-NEXT: @%p1 bra $L__BB149_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB235_1; -; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB149_1; +; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared( +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10919,10 +6968,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 
%rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10932,31 +6981,31 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB236_3; +; SM60-NEXT: @%p1 bra $L__BB150_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB236_1; -; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB150_1; +; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new 
acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_sys( +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10964,10 +7013,10 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -10977,31 +7026,31 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB237_3; +; SM60-NEXT: @%p1 bra $L__BB151_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB237_1; -; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB151_1; +; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_cta( +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11009,10 +7058,10 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11022,31 +7071,31 @@ define i16 @acquire_acquire_i16_shared_cta(ptr 
addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB238_3; +; SM60-NEXT: @%p1 bra $L__BB152_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB238_1; -; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB152_1; +; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_gpu( +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11054,10 +7103,11 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11067,31 +7117,31 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB239_3; +; SM60-NEXT: @%p1 bra $L__BB153_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB239_1; -; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB153_1; +; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic( +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11099,10 +7149,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11113,31 +7163,31 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB240_3; +; SM60-NEXT: @%p1 bra $L__BB154_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB240_1; -; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB154_1; +; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_sys( +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11145,10 +7195,10 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11159,31 +7209,31 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB241_3; +; SM60-NEXT: @%p1 bra $L__BB155_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB241_1; -; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB155_1; +; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_cta( +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11191,10 +7241,10 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11205,31 +7255,31 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB242_3; +; SM60-NEXT: @%p1 bra $L__BB156_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB242_1; -; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB156_1; +; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11237,10 +7287,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11251,31 +7301,31 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB243_3; +; SM60-NEXT: @%p1 bra $L__BB157_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB243_1; -; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB157_1; +; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global( +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11283,10 +7333,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11297,31 +7347,31 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 
%cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB244_3; +; SM60-NEXT: @%p1 bra $L__BB158_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB244_1; -; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB158_1; +; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_sys( +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11329,10 +7379,10 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acquire_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11343,31 +7393,31 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB245_3; +; SM60-NEXT: @%p1 bra $L__BB159_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB245_1; -; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB159_1; +; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") 
acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_cta( +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11375,10 +7425,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11389,31 +7439,31 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra 
$L__BB246_3; +; SM60-NEXT: @%p1 bra $L__BB160_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB246_1; -; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB160_1; +; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_gpu( +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11421,10 +7471,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11435,31 +7485,31 @@ define i16 
@acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB247_3; +; SM60-NEXT: @%p1 bra $L__BB161_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB247_1; -; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB161_1; +; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared( +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11467,11 +7517,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 
%rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11481,31 +7530,31 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB248_3; +; SM60-NEXT: @%p1 bra $L__BB162_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB248_1; -; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB162_1; +; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_sys( +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11513,11 +7562,10 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11527,31 +7575,31 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], 
%r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB249_3; +; SM60-NEXT: @%p1 bra $L__BB163_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB249_1; -; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB163_1; +; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_cta( +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11559,11 +7607,10 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 
%r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11573,31 +7620,31 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB250_3; +; SM60-NEXT: @%p1 bra $L__BB164_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB250_1; -; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB164_1; +; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11605,11 +7652,10 @@ 
define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11619,31 +7665,31 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB251_3; +; SM60-NEXT: @%p1 bra $L__BB165_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB251_1; -; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 
bra $L__BB165_1; +; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11651,11 +7697,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11665,30 +7710,31 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 
%r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB252_3; +; SM60-NEXT: @%p1 bra $L__BB166_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB252_1; -; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB166_1; +; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_sys( +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11696,11 +7742,10 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; 
SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11710,30 +7755,31 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB253_3; +; SM60-NEXT: @%p1 bra $L__BB167_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB253_1; -; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB167_1; +; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_cta( +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_sys( ; 
SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11741,11 +7787,10 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11755,30 +7800,31 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB254_3; +; SM60-NEXT: @%p1 bra $L__BB168_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB254_1; -; SM60-NEXT: $L__BB254_3: // 
%partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB168_1; +; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_gpu( +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11786,11 +7832,10 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11800,30 +7845,31 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB169_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB255_3; +; SM60-NEXT: @%p1 bra $L__BB169_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB255_1; -; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB169_1; +; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global( +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11831,11 +7877,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: 
ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11845,30 +7890,31 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB256_3; +; SM60-NEXT: @%p1 bra $L__BB170_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB256_1; -; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB170_1; +; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_sys( +define i16 
@acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11876,11 +7922,10 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11890,30 +7935,31 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB257_3; +; SM60-NEXT: @%p1 bra $L__BB171_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; 
-; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB257_1; -; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB171_1; +; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } - -define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_cta( + +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11921,11 +7967,10 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11935,30 +7980,31 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, 
%r2; -; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB258_3; +; SM60-NEXT: @%p1 bra $L__BB172_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB258_1; -; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB172_1; +; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_gpu( +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11966,11 +8012,10 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; 
SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -11980,30 +8025,31 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB259_3; +; SM60-NEXT: @%p1 bra $L__BB173_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB259_1; -; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB173_1; +; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 
%new) { -; SM60-LABEL: release_monotonic_i16_shared( +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12011,11 +8057,10 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -12025,30 +8070,31 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB260_3; +; SM60-NEXT: @%p1 bra $L__BB174_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 ; 
SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB260_1; -; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB174_1; +; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_sys( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12056,11 +8102,10 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -12070,30 +8115,31 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, 
[%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB261_3; +; SM60-NEXT: @%p1 bra $L__BB175_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB261_1; -; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB175_1; +; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_cta( +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12101,11 +8147,10 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; 
-; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -12115,30 +8160,31 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB262_3; +; SM60-NEXT: @%p1 bra $L__BB176_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB262_1; -; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB176_1; +; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("device") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_gpu( +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12146,11 +8192,10 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -12160,30 +8205,31 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB263_3; +; SM60-NEXT: @%p1 bra $L__BB177_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB263_1; -; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB177_1; +; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic( +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12191,11 +8237,10 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -12205,31 +8250,31 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB264_3; +; SM60-NEXT: @%p1 bra $L__BB178_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB264_1; -; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB178_1; +; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_sys( +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12237,11 +8282,10 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[release_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -12251,31 +8295,31 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB265_3; +; SM60-NEXT: @%p1 bra $L__BB179_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB265_1; -; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB179_1; +; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) 
%addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_cta( +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12283,10 +8327,10 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12297,31 +8341,31 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB266_3; +; SM60-NEXT: @%p1 bra $L__BB180_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB266_1 
Depth=1 +; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB266_1; -; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB180_1; +; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_gpu( +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12329,10 +8373,10 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12343,31 +8387,31 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB267_3; +; SM60-NEXT: @%p1 bra $L__BB181_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB267_1; -; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB181_1; +; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global( +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12375,10 +8419,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, 
[release_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12389,31 +8433,31 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB268_3; +; SM60-NEXT: @%p1 bra $L__BB182_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB268_1; -; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB182_1; +; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 
@release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_sys( +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12421,10 +8465,10 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12435,31 +8479,31 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB269_3; +; SM60-NEXT: @%p1 bra $L__BB183_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM60-NEXT: 
and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB269_1; -; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB183_1; +; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_cta( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12467,10 +8511,10 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12481,31 +8525,31 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, 
[%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB270_3; +; SM60-NEXT: @%p1 bra $L__BB184_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB270_1; -; SM60-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB184_1; +; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_gpu( +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12513,10 +8557,10 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[acquire_seq_cst_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12527,31 +8571,31 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB271_3; +; SM60-NEXT: @%p1 bra $L__BB185_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB271_1; -; SM60-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB185_1; +; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared( +define i16 
@acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12559,10 +8603,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12573,31 +8617,31 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB272_3; +; SM60-NEXT: @%p1 bra $L__BB186_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB272_1; -; SM60-NEXT: 
$L__BB272_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB186_1; +; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_sys( +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12605,10 +8649,10 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12619,31 +8663,31 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB187_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB273_3; +; SM60-NEXT: @%p1 bra $L__BB187_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB273_1; -; SM60-NEXT: $L__BB273_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB187_1; +; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_cta( +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12651,10 +8695,10 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[acquire_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12665,31 +8709,31 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB274_3; +; SM60-NEXT: @%p1 bra $L__BB188_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB274_1; -; SM60-NEXT: $L__BB274_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB188_1; +; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_gpu( +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 
%new) { +; SM60-LABEL: release_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12697,10 +8741,10 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12711,31 +8755,30 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB275_3; +; SM60-NEXT: @%p1 bra $L__BB189_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB275_1; -; SM60-NEXT: $L__BB275_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB189_1; +; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic( +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12743,10 +8786,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12757,31 +8800,30 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop ; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB276_3; +; SM60-NEXT: @%p1 bra $L__BB190_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB276_1; -; SM60-NEXT: $L__BB276_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB190_1; +; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_sys( +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12789,10 +8831,10 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, 
[release_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12803,31 +8845,30 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB277_3; +; SM60-NEXT: @%p1 bra $L__BB191_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB277_1; -; SM60-NEXT: $L__BB277_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB191_1; +; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_cta( +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12835,10 +8876,10 @@ define i16 
@release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12849,31 +8890,30 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB278_3; +; SM60-NEXT: @%p1 bra $L__BB192_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB278_1; -; SM60-NEXT: $L__BB278_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB192_1; +; SM60-NEXT: 
$L__BB192_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_gpu( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12881,10 +8921,10 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12895,31 +8935,30 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: 
atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB279_3; +; SM60-NEXT: @%p1 bra $L__BB193_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB279_1; -; SM60-NEXT: $L__BB279_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB193_1; +; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global( +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12927,10 +8966,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12941,31 +8980,30 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB280_3; +; SM60-NEXT: @%p1 bra $L__BB194_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB280_1; -; SM60-NEXT: $L__BB280_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB194_1; +; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_sys( +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12973,10 +9011,10 @@ define i16 
@release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12987,31 +9025,30 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB281_3; +; SM60-NEXT: @%p1 bra $L__BB195_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB281_1; -; SM60-NEXT: $L__BB281_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB195_1; +; SM60-NEXT: $L__BB195_3: // 
%partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_cta( +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13019,10 +9056,10 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13033,31 +9070,30 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: 
atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB282_3; +; SM60-NEXT: @%p1 bra $L__BB196_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB282_1; -; SM60-NEXT: $L__BB282_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB196_1; +; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_gpu( +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13065,10 +9101,10 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; ; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13079,31 +9115,30 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB283_3; +; SM60-NEXT: @%p1 bra $L__BB197_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB283_1; -; SM60-NEXT: $L__BB283_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB197_1; +; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared( +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg 
.b16 %rs<2>; @@ -13111,10 +9146,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13125,31 +9160,31 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB284_3; +; SM60-NEXT: @%p1 bra $L__BB198_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB284_1; -; SM60-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB198_1; +; SM60-NEXT: $L__BB198_3: // 
%partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_sys( +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13157,10 +9192,10 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13171,31 +9206,31 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB285_3; +; SM60-NEXT: @%p1 bra $L__BB199_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB285_1; -; SM60-NEXT: $L__BB285_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB199_1; +; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_cta( +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13203,10 +9238,10 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, 
[release_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13217,31 +9252,31 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB286_3; +; SM60-NEXT: @%p1 bra $L__BB200_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB286_1; -; SM60-NEXT: $L__BB286_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB200_1; +; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_gpu( +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_sys( 
; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13249,10 +9284,10 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13263,31 +9298,31 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB287_3; +; SM60-NEXT: @%p1 bra $L__BB201_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB287_1; -; SM60-NEXT: $L__BB287_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB201_1; +; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13295,10 +9330,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13309,31 +9344,31 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB202_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB288_3; +; SM60-NEXT: @%p1 bra $L__BB202_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB288_1; -; SM60-NEXT: $L__BB288_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB202_1; +; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13341,10 +9376,10 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: 
ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13355,31 +9390,31 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB289_3; +; SM60-NEXT: @%p1 bra $L__BB203_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB289_1; -; SM60-NEXT: $L__BB289_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB203_1; +; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( +define i16 
@release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13387,10 +9422,10 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13401,31 +9436,31 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB290_3; +; SM60-NEXT: @%p1 bra $L__BB204_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; 
-; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB290_1; -; SM60-NEXT: $L__BB290_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB204_1; +; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13433,10 +9468,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13447,31 +9482,31 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 
%r19, %r15, %r2; -; SM60-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB291_3; +; SM60-NEXT: @%p1 bra $L__BB205_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB291_1; -; SM60-NEXT: $L__BB291_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB205_1; +; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global( +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13479,10 +9514,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, 
[acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13493,31 +9528,31 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB292_3; +; SM60-NEXT: @%p1 bra $L__BB206_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB292_1; -; SM60-NEXT: $L__BB292_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB206_1; +; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 
@acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_sys( +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13525,10 +9560,10 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13539,31 +9574,31 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB293_3; +; SM60-NEXT: @%p1 bra $L__BB207_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM60-NEXT: // in 
Loop: Header=BB207_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB293_1; -; SM60-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB207_1; +; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_cta( +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13571,10 +9606,10 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13585,31 +9620,31 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; 
SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB294_3; +; SM60-NEXT: @%p1 bra $L__BB208_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB294_1; -; SM60-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB208_1; +; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_gpu( +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13617,10 +9652,10 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, 
[release_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13631,31 +9666,31 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB295_3; +; SM60-NEXT: @%p1 bra $L__BB209_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB295_1; -; SM60-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB209_1; +; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr 
addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared( +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13663,10 +9698,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13677,31 +9712,31 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB296_3; +; SM60-NEXT: @%p1 bra $L__BB210_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 ; 
SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB296_1; -; SM60-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB210_1; +; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_sys( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13709,10 +9744,10 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13723,31 +9758,31 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; 
SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB297_3; +; SM60-NEXT: @%p1 bra $L__BB211_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB297_1; -; SM60-NEXT: $L__BB297_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB211_1; +; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13755,10 +9790,10 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13769,31 +9804,31 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB298_3; +; SM60-NEXT: @%p1 bra $L__BB212_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB298_1; -; SM60-NEXT: $L__BB298_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB212_1; +; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + 
%pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13801,10 +9836,10 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13815,31 +9850,31 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB299_3; +; SM60-NEXT: @%p1 bra $L__BB213_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB299_1; -; SM60-NEXT: $L__BB299_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB213_1; +; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic( +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13847,10 +9882,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13861,31 +9896,31 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB300_3; +; SM60-NEXT: @%p1 bra $L__BB214_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB300_1; -; SM60-NEXT: $L__BB300_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB214_1; +; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_sys( +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13893,10 +9928,10 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acq_rel_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13907,31 +9942,31 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB301_3; +; SM60-NEXT: @%p1 bra $L__BB215_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB301_1; -; SM60-NEXT: $L__BB301_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB215_1; +; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_cta( +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13939,10 +9974,10 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13953,31 +9988,31 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB302_3; +; SM60-NEXT: @%p1 bra $L__BB216_3; ; SM60-NEXT: // 
%bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB302_1; -; SM60-NEXT: $L__BB302_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB216_1; +; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -13985,10 +10020,10 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -13999,31 +10034,31 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB303_3; +; SM60-NEXT: @%p1 bra $L__BB217_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB303_1; -; SM60-NEXT: $L__BB303_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB217_1; +; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global( +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14031,10 +10066,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14045,31 +10080,31 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB304_3; +; SM60-NEXT: @%p1 bra $L__BB218_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB304_1; -; SM60-NEXT: $L__BB304_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB218_1; +; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 
%new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_sys( +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14077,10 +10112,10 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14091,31 +10126,31 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB305_3; +; SM60-NEXT: @%p1 bra $L__BB219_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: 
Header=BB305_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB305_1; -; SM60-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB219_1; +; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_cta( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14123,10 +10158,10 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14137,31 +10172,31 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB306_3; +; SM60-NEXT: @%p1 bra $L__BB220_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB306_1; -; SM60-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB220_1; +; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_gpu( +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14169,10 +10204,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; 
SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14183,31 +10218,31 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB307_3; +; SM60-NEXT: @%p1 bra $L__BB221_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB307_1; -; SM60-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB221_1; +; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared(ptr 
addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared( +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14215,10 +10250,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14229,31 +10264,31 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB308_3; +; SM60-NEXT: @%p1 bra $L__BB222_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: 
setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB308_1; -; SM60-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB222_1; +; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_sys( +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14261,10 +10296,10 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14275,31 +10310,31 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; 
SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB309_3; +; SM60-NEXT: @%p1 bra $L__BB223_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB309_1; -; SM60-NEXT: $L__BB309_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB223_1; +; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_cta( +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14307,10 +10342,10 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, 
[acq_rel_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14321,31 +10356,31 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB310_3; +; SM60-NEXT: @%p1 bra $L__BB224_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB310_1; -; SM60-NEXT: $L__BB310_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB224_1; +; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_gpu(ptr 
addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14353,10 +10388,10 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14367,31 +10402,31 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB311_3; +; SM60-NEXT: @%p1 bra $L__BB225_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM60-NEXT: // in Loop: 
Header=BB225_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB311_1; -; SM60-NEXT: $L__BB311_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB225_1; +; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic( +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14399,10 +10434,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14413,31 +10448,31 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: 
ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB312_3; +; SM60-NEXT: @%p1 bra $L__BB226_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB312_1; -; SM60-NEXT: $L__BB312_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB226_1; +; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys( +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14445,10 +10480,10 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, 
[acq_rel_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14459,31 +10494,31 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB313_3; +; SM60-NEXT: @%p1 bra $L__BB227_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB313_1; -; SM60-NEXT: $L__BB313_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB227_1; +; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) 
%addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14491,10 +10526,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14505,31 +10540,31 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB314_3; +; SM60-NEXT: @%p1 bra $L__BB228_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB314_1; -; SM60-NEXT: $L__BB314_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB228_1; +; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14537,10 +10572,10 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14551,31 +10586,31 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB315_1: // %partword.cmpxchg.loop 
+; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB315_3; +; SM60-NEXT: @%p1 bra $L__BB229_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB315_1; -; SM60-NEXT: $L__BB315_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB229_1; +; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global( +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14583,10 +10618,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, 
[acq_rel_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14597,31 +10632,31 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB316_3; +; SM60-NEXT: @%p1 bra $L__BB230_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB316_1; -; SM60-NEXT: $L__BB316_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB230_1; +; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( +define i16 
@acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14629,10 +10664,10 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14643,31 +10678,31 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB317_3; +; SM60-NEXT: @%p1 bra $L__BB231_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: 
mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB317_1; -; SM60-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB231_1; +; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14675,10 +10710,10 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14689,31 +10724,31 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB318_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB318_3; +; SM60-NEXT: @%p1 bra $L__BB232_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB318_1; -; SM60-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB232_1; +; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14721,10 +10756,10 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, 
[acq_rel_acquire_i16_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14735,31 +10770,31 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB319_3; +; SM60-NEXT: @%p1 bra $L__BB233_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB319_1; -; SM60-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB233_1; +; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: 
acq_rel_seq_cst_i16_shared( +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14767,10 +10802,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14781,31 +10816,31 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB320_3; +; SM60-NEXT: @%p1 bra $L__BB234_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB320_1; -; SM60-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB234_1; +; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14813,10 +10848,10 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14827,31 +10862,31 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB321_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB321_3; +; SM60-NEXT: @%p1 bra $L__BB235_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB321_1; -; SM60-NEXT: $L__BB321_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB235_1; +; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14859,10 +10894,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, 
[acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14873,31 +10908,31 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB322_3; +; SM60-NEXT: @%p1 bra $L__BB236_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB322_1; -; SM60-NEXT: $L__BB322_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB236_1; +; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; 
SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14905,10 +10940,10 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14919,31 +10954,31 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB323_3; +; SM60-NEXT: @%p1 bra $L__BB237_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM60-NEXT: 
and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB323_1; -; SM60-NEXT: $L__BB323_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB237_1; +; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14951,10 +10986,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -14965,31 +11000,31 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: 
ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB324_3; +; SM60-NEXT: @%p1 bra $L__BB238_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB324_1; -; SM60-NEXT: $L__BB324_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB238_1; +; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -14997,10 +11032,10 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15011,31 +11046,31 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB325_3; +; SM60-NEXT: @%p1 bra $L__BB239_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB325_1; -; SM60-NEXT: $L__BB325_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB239_1; +; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define 
i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15043,10 +11078,10 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15057,31 +11092,31 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB326_3; +; SM60-NEXT: @%p1 bra $L__BB240_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB326_1 
Depth=1 +; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB326_1; -; SM60-NEXT: $L__BB326_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB240_1; +; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15089,10 +11124,10 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15103,31 +11138,31 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; 
; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB327_3; +; SM60-NEXT: @%p1 bra $L__BB241_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB327_1; -; SM60-NEXT: $L__BB327_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB241_1; +; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global( +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15135,10 +11170,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM60-NEXT: 
ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15149,31 +11184,31 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB328_3; +; SM60-NEXT: @%p1 bra $L__BB242_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB328_1; -; SM60-NEXT: $L__BB328_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB242_1; +; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + %pairold = 
cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_sys( +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15181,10 +11216,10 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15195,31 +11230,31 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB329_3; +; SM60-NEXT: @%p1 bra 
$L__BB243_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB329_1; -; SM60-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB243_1; +; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_cta( +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15227,10 +11262,10 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15241,31 +11276,31 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB330_3; +; SM60-NEXT: @%p1 bra $L__BB244_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB330_1; -; SM60-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB244_1; +; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15273,10 +11308,10 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15287,31 +11322,31 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB331_3; +; SM60-NEXT: @%p1 bra $L__BB245_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB331_1; -; SM60-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB245_1; +; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") 
seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared( +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15319,10 +11354,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15333,31 +11368,31 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB332_3; +; 
SM60-NEXT: @%p1 bra $L__BB246_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB332_1; -; SM60-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB246_1; +; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15365,10 +11400,10 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15379,31 +11414,31 @@ define i16 
@seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB333_3; +; SM60-NEXT: @%p1 bra $L__BB247_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB333_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB333_1; -; SM60-NEXT: $L__BB333_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB247_1; +; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared_cta( +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15411,10 +11446,10 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr 
addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15425,31 +11460,31 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB334_3; +; SM60-NEXT: @%p1 bra $L__BB248_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB334_1; -; SM60-NEXT: $L__BB334_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB248_1; +; SM60-NEXT: $L__BB248_3: // 
%partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15457,10 +11492,10 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15471,31 +11506,31 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 
%r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB335_3; +; SM60-NEXT: @%p1 bra $L__BB249_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB335_1; -; SM60-NEXT: $L__BB335_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB249_1; +; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic( +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15503,10 +11538,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, 
[seq_cst_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15517,31 +11552,31 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB336_3; +; SM60-NEXT: @%p1 bra $L__BB250_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB336_1; -; SM60-NEXT: $L__BB336_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB250_1; +; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_sys( +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg 
.pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15549,10 +11584,10 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15563,31 +11598,31 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB337_3; +; SM60-NEXT: @%p1 bra $L__BB251_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB337_1; -; SM60-NEXT: $L__BB337_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; 
SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB251_1; +; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_cta( +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15595,10 +11630,10 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15609,31 +11644,31 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB338_3; +; SM60-NEXT: @%p1 bra $L__BB252_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB338_1; -; SM60-NEXT: $L__BB338_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB252_1; +; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15641,10 +11676,10 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.u16 %r9, 
[seq_cst_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15655,31 +11690,31 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB339_3; +; SM60-NEXT: @%p1 bra $L__BB253_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB339_1; -; SM60-NEXT: $L__BB339_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB253_1; +; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global( +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -15687,10 +11722,10 @@ 
define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15701,26 +11736,26 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB340_3; +; SM60-NEXT: @%p1 bra $L__BB254_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB340_1; -; SM60-NEXT: $L__BB340_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB254_1; +; SM60-NEXT: 
$L__BB254_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire ret i16 %new } @@ -15733,10 +11768,10 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15747,22 +11782,22 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB341_3; +; SM60-NEXT: @%p1 bra $L__BB255_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; 
SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB341_1; -; SM60-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB255_1; +; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -15779,10 +11814,10 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15793,22 +11828,22 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB342_3; +; SM60-NEXT: @%p1 bra $L__BB256_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM60-NEXT: // in Loop: 
Header=BB256_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB342_1; -; SM60-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB256_1; +; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -15825,10 +11860,10 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15839,22 +11874,22 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB343_3; +; SM60-NEXT: @%p1 bra $L__BB257_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // 
in Loop: Header=BB343_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB343_1; -; SM60-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB257_1; +; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -15862,52 +11897,6 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB344_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB344_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in 
Loop: Header=BB344_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB344_1; -; SM60-NEXT: $L__BB344_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_acquire_i16_shared_sys( ; SM60: { @@ -15917,10 +11906,10 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15931,22 +11920,22 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM60-NEXT: @%p1 bra $L__BB345_3; +; SM60-NEXT: @%p1 bra $L__BB258_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB345_1; -; SM60-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB258_1; +; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -15963,10 +11952,10 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -15977,22 +11966,22 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, 
[%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB346_3; +; SM60-NEXT: @%p1 bra $L__BB259_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB346_1; -; SM60-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB259_1; +; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16009,10 +11998,10 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16023,22 +12012,22 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: 
or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB347_3; +; SM60-NEXT: @%p1 bra $L__BB260_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB347_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB347_1; -; SM60-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB260_1; +; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16046,52 +12035,6 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB348_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; 
SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB348_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB348_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB348_1; -; SM60-NEXT: $L__BB348_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( ; SM60: { @@ -16101,10 +12044,10 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16115,22 +12058,22 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB349_3; +; SM60-NEXT: @%p1 bra $L__BB261_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB349_1; -; SM60-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB261_1; +; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16147,10 +12090,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16161,22 +12104,22 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB262_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB350_3; +; SM60-NEXT: @%p1 bra $L__BB262_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB350_1; -; SM60-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB262_1; +; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16193,10 +12136,10 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16207,22 +12150,22 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: ld.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB351_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB351_3; +; SM60-NEXT: @%p1 bra $L__BB263_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB351_1; -; SM60-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB263_1; +; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16230,52 +12173,6 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; 
SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB352_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB352_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB352_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB352_1; -; SM60-NEXT: $L__BB352_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_seq_cst_i16_global_sys( ; SM60: { @@ -16285,10 +12182,10 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16299,22 +12196,22 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 
%r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB353_3; +; SM60-NEXT: @%p1 bra $L__BB264_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB353_1; -; SM60-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB264_1; +; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16331,10 +12228,10 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16345,22 +12242,22 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB354_3; +; SM60-NEXT: @%p1 bra $L__BB265_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB354_1; -; SM60-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB265_1; +; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16377,10 +12274,10 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16391,22 +12288,22 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: ld.global.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB355_3; +; SM60-NEXT: @%p1 bra $L__BB266_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB355_1; -; SM60-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB266_1; +; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16414,52 +12311,6 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; 
SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB356_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB356_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB356_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB356_1; -; SM60-NEXT: $L__BB356_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys( ; SM60: { @@ -16469,10 +12320,10 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 
%r11, %r10, 3; @@ -16483,22 +12334,22 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB357_3; +; SM60-NEXT: @%p1 bra $L__BB267_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB357_1; -; SM60-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB267_1; +; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16515,10 +12366,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; 
; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16529,22 +12380,22 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB358_3; +; SM60-NEXT: @%p1 bra $L__BB268_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB358_1; -; SM60-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB268_1; +; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16561,10 +12412,10 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u16 %r9, 
[seq_cst_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -16575,22 +12426,22 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB359_3; +; SM60-NEXT: @%p1 bra $L__BB269_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB359_1; -; SM60-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB269_1; +; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -16598,23 +12449,6 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[monotonic_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_generic_sys( ; SM60: { @@ -16622,9 +12456,9 @@ define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16639,9 +12473,9 @@ define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16656,9 +12490,9 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr 
%addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16666,23 +12500,6 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_global_sys( ; SM60: { @@ -16690,9 +12507,9 @@ define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; 
SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16707,9 +12524,9 @@ define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16724,9 +12541,9 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16734,23 +12551,6 @@ define i32 
@monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ret i32 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_shared_sys( ; SM60: { @@ -16758,9 +12558,9 @@ define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16775,9 +12575,9 @@ define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, 
[monotonic_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16792,9 +12592,9 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16802,23 +12602,6 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ret i32 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic 
acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_generic_sys( ; SM60: { @@ -16826,9 +12609,9 @@ define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16843,9 +12626,9 @@ define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16860,9 +12643,9 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; 
SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16870,23 +12653,6 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_global_sys( ; SM60: { @@ -16894,9 +12660,9 @@ define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16911,9 +12677,9 @@ define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16928,9 +12694,9 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16938,23 +12704,6 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; 
SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_shared_sys( ; SM60: { @@ -16962,9 +12711,9 @@ define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16979,9 +12728,9 @@ define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -16996,9 +12745,9 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17006,24 +12755,6 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_seq_cst_i32_generic_sys( ; SM60: { @@ -17031,10 +12762,10 @@ define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; 
; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17049,10 +12780,10 @@ define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17067,10 +12798,10 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; 
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17078,24 +12809,6 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_seq_cst_i32_global_sys( ; SM60: { @@ -17103,10 +12816,10 @@ define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17121,10 +12834,10 @@ define i32 
@monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17139,10 +12852,10 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17150,24 +12863,6 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_seq_cst_i32_shared_sys( ; SM60: { @@ -17175,10 +12870,10 @@ define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17193,10 +12888,10 @@ define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; 
SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17211,10 +12906,10 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17222,23 +12917,6 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_generic_sys( ; SM60: { @@ -17246,9 +12924,9 @@ define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17263,9 +12941,9 @@ define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17280,9 +12958,9 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17290,23 +12968,6 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_global_sys( ; SM60: { @@ -17314,9 +12975,9 @@ define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17331,9 +12992,9 @@ define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17348,9 +13009,9 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17358,23 +13019,6 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_shared_sys( ; SM60: { @@ -17382,9 +13026,9 @@ define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17399,9 +13043,9 @@ define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17416,9 +13060,9 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17426,23 +13070,6 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_generic_sys( ; SM60: { @@ -17450,9 +13077,9 @@ define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; 
SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17467,9 +13094,9 @@ define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17484,9 +13111,9 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17494,23 +13121,6 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_global_sys( ; SM60: { @@ -17518,9 +13128,9 @@ define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17535,9 +13145,9 @@ define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, 
[%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17552,9 +13162,9 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17562,23 +13172,6 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_shared_sys( ; SM60: { @@ -17586,9 +13179,9 @@ define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17603,9 +13196,9 @@ define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17620,9 +13213,9 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM60-NEXT: ret; @@ -17630,24 +13223,6 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_generic_sys( ; SM60: { @@ -17655,10 +13230,10 @@ define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17673,10 +13248,10 @@ define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, 
[acquire_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17691,10 +13266,10 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17702,24 +13277,6 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_global_sys( ; SM60: { @@ -17727,10 +13284,10 @@ define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17745,10 +13302,10 @@ define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17763,10 +13320,10 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17774,24 +13331,6 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_shared_sys( ; SM60: { @@ -17799,10 +13338,10 @@ define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[acquire_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17817,10 +13356,10 @@ define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17835,10 +13374,10 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17846,23 +13385,6 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 
@release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_monotonic_i32_generic_sys( ; SM60: { @@ -17870,9 +13392,9 @@ define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17887,9 +13409,9 @@ define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, 
[release_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17904,9 +13426,9 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -17914,23 +13436,6 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: 
release_monotonic_i32_global_sys( ; SM60: { @@ -17938,64 +13443,47 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic ret i32 %new } - -define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global_gpu( + +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_cta( ; SM60: { ; SM60-NEXT: .reg .b32 %r<4>; ; SM60-NEXT: .reg .b64 %rd<2>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic ret i32 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared( +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_gpu( ; SM60: { ; SM60-NEXT: .reg .b32 %r<4>; ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new 
syncscope("device") release monotonic ret i32 %new } @@ -18006,9 +13494,9 @@ define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18023,9 +13511,9 @@ define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18040,9 +13528,9 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, 
[release_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18050,23 +13538,6 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_generic_sys( ; SM60: { @@ -18074,9 +13545,9 @@ define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18091,9 +13562,9 @@ define i32 
@release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18108,9 +13579,9 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18118,23 +13589,6 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[release_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_global_sys( ; SM60: { @@ -18142,9 +13596,9 @@ define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18159,9 +13613,9 @@ define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18176,9 +13630,9 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; 
SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18186,23 +13640,6 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_shared_sys( ; SM60: { @@ -18210,9 +13647,9 @@ define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 
%rd1, [release_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18227,9 +13664,9 @@ define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18244,9 +13681,9 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18254,24 +13691,6 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; 
SM60-LABEL: release_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_generic_sys( ; SM60: { @@ -18279,10 +13698,10 @@ define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18297,10 +13716,10 @@ define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; 
SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18315,10 +13734,10 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18326,24 +13745,6 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_global_sys( ; SM60: { @@ -18351,10 
+13752,10 @@ define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18369,10 +13770,10 @@ define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18387,10 +13788,10 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[release_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18398,24 +13799,6 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_shared_sys( ; SM60: { @@ -18423,10 +13806,10 @@ define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18441,10 +13824,10 @@ define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18459,10 +13842,10 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18470,23 +13853,6 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_generic_sys( ; SM60: { @@ -18494,9 +13860,9 @@ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18511,9 +13877,9 @@ define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 
[func_retval0], %r2; ; SM60-NEXT: ret; @@ -18528,9 +13894,9 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18538,23 +13904,6 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_global_sys( ; SM60: { @@ -18562,9 +13911,9 @@ define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; 
SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18579,9 +13928,9 @@ define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18596,9 +13945,9 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: 
ret; @@ -18606,23 +13955,6 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_shared_sys( ; SM60: { @@ -18630,9 +13962,9 @@ define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18647,9 +13979,9 @@ define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, 
[acq_rel_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18664,9 +13996,9 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18674,23 +14006,6 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define 
i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_generic_sys( ; SM60: { @@ -18698,9 +14013,9 @@ define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18715,9 +14030,9 @@ define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18732,9 +14047,9 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; 
SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18742,23 +14057,6 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_global_sys( ; SM60: { @@ -18766,9 +14064,9 @@ define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM60-NEXT: ret; @@ -18783,9 +14081,9 @@ define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18800,9 +14098,9 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18810,23 +14108,6 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; 
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_shared_sys( ; SM60: { @@ -18834,9 +14115,9 @@ define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18851,9 +14132,9 @@ define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18868,9 +14149,9 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr 
addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18878,24 +14159,6 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys( ; SM60: { @@ -18903,10 +14166,10 @@ define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, 
[acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18921,10 +14184,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18939,10 +14202,10 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18950,24 +14213,6 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 
%new) { ret i32 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_global_sys( ; SM60: { @@ -18975,10 +14220,10 @@ define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -18993,10 +14238,10 @@ define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; 
SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19011,10 +14256,10 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19022,24 +14267,6 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new 
-} - define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys( ; SM60: { @@ -19047,10 +14274,10 @@ define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19065,10 +14292,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19083,10 +14310,10 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19094,24 +14321,6 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_monotonic_i32_generic_sys( ; SM60: { @@ -19119,10 +14328,10 @@ define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, 
[seq_cst_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19137,10 +14346,10 @@ define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19155,10 +14364,10 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19166,24 +14375,6 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; 
SM60-LABEL: seq_cst_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_monotonic_i32_global_sys( ; SM60: { @@ -19191,10 +14382,10 @@ define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19209,10 +14400,10 @@ define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: 
ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19227,10 +14418,10 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19238,24 +14429,6 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_shared_sys(ptr 
addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_monotonic_i32_shared_sys( ; SM60: { @@ -19263,10 +14436,10 @@ define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19281,10 +14454,10 @@ define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19299,10 +14472,10 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; ; SM60-NEXT: 
membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19310,24 +14483,6 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_acquire_i32_generic_sys( ; SM60: { @@ -19335,10 +14490,10 @@ define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, 
[seq_cst_acquire_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19353,10 +14508,10 @@ define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19371,10 +14526,10 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19382,24 +14537,6 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; 
-; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_acquire_i32_global_sys( ; SM60: { @@ -19407,10 +14544,10 @@ define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19425,10 +14562,10 @@ define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM60-NEXT: 
ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19443,32 +14580,14 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire ret i32 %new } @@ -19479,10 +14598,10 @@ define i32 
@seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19497,10 +14616,10 @@ define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19515,10 +14634,10 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; 
SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19526,24 +14645,6 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys( ; SM60: { @@ -19551,10 +14652,10 @@ define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19569,10 +14670,10 @@ define i32 
@seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19587,10 +14688,10 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19598,24 +14699,6 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, 
[seq_cst_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_global_sys( ; SM60: { @@ -19623,10 +14706,10 @@ define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19641,10 +14724,10 @@ define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM60-NEXT: ret; @@ -19659,10 +14742,10 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19670,24 +14753,6 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys( ; SM60: { @@ -19695,10 +14760,10 @@ define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19713,10 +14778,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19731,10 +14796,10 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; ; SM60-NEXT: 
atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -19742,31 +14807,15 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19780,9 +14829,9 @@ define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19796,9 +14845,9 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19806,31 +14855,15 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 
%cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19844,9 +14877,9 @@ define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19860,9 +14893,9 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: 
ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19870,31 +14903,15 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19908,9 +14925,9 @@ define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19924,9 +14941,9 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19934,31 +14951,15 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19972,9 +14973,9 @@ define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19988,9 +14989,9 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 
%new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -19998,31 +14999,15 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 
%rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20036,9 +15021,9 @@ define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20052,9 +15037,9 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20062,31 +15047,15 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: 
monotonic_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20100,9 +15069,9 @@ define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; 
; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20116,9 +15085,9 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20126,33 +15095,16 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20166,10 +15118,10 @@ define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20183,10 +15135,10 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20194,33 +15146,16 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20234,10 +15169,10 @@ define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20251,10 +15186,10 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20262,33 +15197,16 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20302,10 +15220,10 @@ define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20319,10 +15237,10 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 
%rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20330,31 +15248,15 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20368,9 +15270,9 @@ define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20384,9 +15286,9 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20394,31 +15296,15 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM60-NEXT: 
ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20432,9 +15318,9 @@ define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20448,9 +15334,9 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr 
addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20458,31 +15344,15 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; 
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20496,9 +15366,9 @@ define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20512,9 +15382,9 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20522,31 +15392,15 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; 
SM60-LABEL: acquire_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20560,9 +15414,9 @@ define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; 
; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20576,9 +15430,9 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20586,31 +15440,15 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, 
[acquire_acquire_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20624,9 +15462,9 @@ define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20640,9 +15478,9 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20650,31 +15488,15 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret 
i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20688,9 +15510,9 @@ define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; 
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20704,9 +15526,9 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20714,33 +15536,16 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; 
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20754,10 +15559,10 @@ define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20771,10 +15576,10 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20782,33 +15587,16 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20822,10 +15610,10 @@ define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; 
SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20839,10 +15627,10 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20850,33 +15638,16 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire 
seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20890,10 +15661,10 @@ define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20907,10 +15678,10 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20918,31 +15689,15 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20956,9 +15711,9 @@ define i64 
@release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20972,9 +15727,9 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -20982,31 +15737,15 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, 
[release_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21020,9 +15759,9 @@ define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21036,29 +15775,13 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: 
// %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic ret i64 %new } @@ -21068,9 +15791,9 @@ define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21084,9 +15807,9 @@ define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21100,9 +15823,9 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21110,31 +15833,15 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: 
release_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21148,9 +15855,9 @@ define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21164,9 +15871,9 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21174,31 +15881,15 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, 
[release_acquire_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21212,9 +15903,9 @@ define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21228,9 +15919,9 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21238,31 +15929,15 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret 
i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21276,9 +15951,9 @@ define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; 
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21292,9 +15967,9 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21302,33 +15977,16 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; 
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21342,10 +16000,10 @@ define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21359,10 +16017,10 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21370,33 +16028,16 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21410,10 +16051,10 @@ define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; 
SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21427,10 +16068,10 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21438,33 +16079,16 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release 
seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21478,10 +16102,10 @@ define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21495,10 +16119,10 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21506,31 +16130,15 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21544,9 +16152,9 @@ define i64 
@acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21560,9 +16168,9 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21570,31 +16178,15 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, 
[acq_rel_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21608,9 +16200,9 @@ define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21624,9 +16216,9 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // 
%bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21634,31 +16226,15 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: 
ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21672,9 +16248,9 @@ define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21688,9 +16264,9 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21698,31 +16274,15 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; 
SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21736,9 +16296,9 @@ define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21752,9 +16312,9 
@@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21762,31 +16322,15 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21800,9 +16344,9 @@ define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21816,9 +16360,9 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21826,31 +16370,15 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, 
i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21864,9 +16392,9 @@ define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; 
; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21880,9 +16408,9 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21890,33 +16418,16 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; 
; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21930,10 +16441,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21947,10 +16458,10 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21958,33 
+16469,16 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -21998,10 +16492,10 @@ define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22015,10 +16509,10 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22026,33 +16520,16 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 
@acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22066,10 +16543,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22083,10 +16560,10 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22094,33 +16571,16 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ 
-22134,10 +16594,10 @@ define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22151,10 +16611,10 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22162,33 +16622,16 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM60-NEXT: membar.sys; 
-; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22202,10 +16645,10 @@ define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ 
-22219,10 +16662,10 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22230,33 +16673,16 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; 
SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22270,10 +16696,10 @@ define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22287,10 +16713,10 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: 
ret; @@ -22298,33 +16724,16 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22338,10 +16747,10 @@ define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22355,10 +16764,10 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22366,33 +16775,16 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 
@seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22406,10 +16798,10 @@ define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22423,10 +16815,10 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22434,33 +16826,16 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: 
ret; @@ -22474,10 +16849,10 @@ define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22491,10 +16866,10 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22502,33 +16877,16 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22542,10 +16900,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22559,10 +16917,10 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, 
i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22570,33 +16928,16 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, 
[seq_cst_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22610,10 +16951,10 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22627,10 +16968,10 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22638,33 +16979,16 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 
@seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22678,10 +17002,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: 
ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -22695,10 +17019,10 @@ define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 30f3b02b89e77..bae9520da7905 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, 
[monotonic_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -178,12 +178,12 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -203,9 +203,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -223,12 +223,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -248,9 +248,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -268,12 +268,12 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -313,12 +313,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -358,12 +358,12 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -383,9 +383,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -403,12 +403,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -416,8 +416,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -446,14 +446,15 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -461,8 +462,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, 
[monotonic_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -473,9 +474,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -491,14 +492,15 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -506,8 +508,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -518,9 +520,9 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -536,14 +538,15 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -551,8 +554,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -563,9 +566,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -584,12 +587,12 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -597,8 +600,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; 
SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -609,9 +612,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -627,15 +630,15 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -643,8 +646,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: 
ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -655,9 +658,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -673,15 +676,15 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -689,8 +692,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 
%rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -701,9 +704,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -719,15 +722,15 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -735,8 +738,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 
%rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -747,9 +750,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -765,15 +768,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global_sys( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -781,8 +784,8 @@ define i8 
@monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -793,9 +796,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -811,15 +814,15 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global_cta( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM70: 
{ ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -827,8 +830,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -839,9 +843,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -857,15 +861,15 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global_gpu( +define i8 
@monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -873,8 +877,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -885,9 +890,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -903,15 +908,15 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 
@monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -919,8 +924,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -931,9 +937,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -949,15 +955,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, 
i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -965,8 +971,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -977,9 +984,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -998,12 +1005,12 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1011,8 +1018,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1023,9 +1031,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1044,12 +1052,12 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1057,8 +1065,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1069,9 +1078,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1090,12 +1099,12 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1103,8 +1112,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1116,9 +1125,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1137,12 +1146,12 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
monotonic_seq_cst_i8_generic_sys( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1150,9 +1159,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1163,9 +1172,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1181,15 +1190,15 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") 
monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1197,9 +1206,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1210,9 +1219,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1228,15 +1237,15 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, 
i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1244,9 +1253,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1257,9 +1265,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1275,15 +1283,15 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1291,9 +1299,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1304,9 +1311,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1322,15 +1329,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end 
-; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_sys( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1338,9 +1345,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1351,9 +1357,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1369,15 +1375,15 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: 
mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_cta( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1385,9 +1391,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1398,9 +1403,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -1416,15 +1421,15 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB30_1; ; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1432,9 +1437,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1445,9 +1449,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, 
[%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1463,15 +1467,15 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB31_1; ; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1479,9 +1483,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1492,9 +1495,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, 
[acquire_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1510,15 +1513,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB32_1; ; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1526,9 +1529,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1539,9 +1541,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1560,12 +1562,12 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1573,9 +1575,8 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1586,9 +1587,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1607,12 +1608,12 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1620,9 +1621,8 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1633,9 +1633,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1654,12 +1654,12 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1667,8 +1667,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1679,9 +1679,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM70-NEXT: 
ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1700,12 +1700,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_sys( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1713,8 +1713,8 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1725,9 +1725,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: 
ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1743,15 +1743,15 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB37_1; ; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_cta( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1759,8 +1759,8 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1771,9 +1771,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 
%r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1789,15 +1789,15 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB38_1; ; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_gpu( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1805,8 +1805,8 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1817,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM70-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1835,15 +1835,15 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB39_1; ; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1851,8 +1851,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1863,9 +1863,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, 
[acquire_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1881,15 +1881,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB40_1; ; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_sys( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1897,8 +1897,8 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1909,9 +1909,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: 
ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1927,15 +1927,15 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB41_1; ; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_cta( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1943,8 +1943,8 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 
%r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1973,15 +1973,15 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB42_1; ; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_gpu( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -1989,8 +1989,8 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2001,9 +2001,9 @@ define 
i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2019,15 +2019,15 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB43_1; ; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2035,8 +2035,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2047,9 +2047,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2065,15 +2065,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB44_1; ; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_sys( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2081,8 +2081,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2093,9 +2094,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2114,12 +2115,12 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_cta( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2127,8 +2128,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: 
fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2139,9 +2141,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2160,12 +2162,12 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_gpu( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2173,8 +2175,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, 
-4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2185,9 +2188,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2206,12 +2209,12 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2219,8 +2222,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ 
-2231,9 +2235,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2252,12 +2256,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_sys( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2265,8 +2269,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2277,9 +2282,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 
%r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2295,15 +2300,15 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB49_1; ; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_cta( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2311,8 +2316,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2323,9 +2329,9 @@ define i8 
@acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2341,15 +2347,15 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB50_1; ; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2357,8 +2363,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, 
%rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2369,9 +2376,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2387,15 +2394,15 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB51_1; ; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2403,8 +2410,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2415,9 +2423,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2433,15 +2441,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB52_1; ; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global_sys( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2449,8 +2457,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2461,9 +2470,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2479,15 +2488,15 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB53_1; ; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global_cta( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2495,8 +2504,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2507,9 +2517,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2525,15 +2535,14 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB54_1; ; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global_gpu( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2541,8 +2550,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[acquire_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2553,9 +2563,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2571,15 +2581,14 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB55_1; ; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2587,8 +2596,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; 
; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2599,9 +2609,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2617,15 +2627,14 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB56_1; ; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_sys( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2633,8 +2642,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr 
addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2645,9 +2655,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2663,15 +2673,14 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB57_1; ; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_cta( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; 
SM70-NEXT: .reg .b16 %rs<2>; @@ -2679,8 +2688,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2691,9 +2701,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2709,15 +2719,14 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB58_1; ; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_gpu( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 
%cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2725,8 +2734,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2737,9 +2747,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2755,15 +2765,14 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB59_1; ; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
acquire_seq_cst_i8_generic( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2771,9 +2780,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2784,9 +2793,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2802,15 +2811,14 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 
%cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_sys( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2818,9 +2826,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2831,9 +2839,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2849,15 +2857,14 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB61_1; ; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release 
monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_cta( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2865,9 +2872,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2878,9 +2885,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2896,15 +2903,14 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB62_1; ; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2912,9 +2918,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2925,9 +2931,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2943,15 +2949,15 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB63_1; ; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: 
ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -2959,9 +2965,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2972,9 +2978,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2990,15 +2996,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; 
SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_sys( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3006,9 +3012,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3019,9 +3025,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3037,15 +3043,15 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: 
@%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_cta( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3053,9 +3059,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3066,9 +3072,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3084,15 
+3090,15 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB66_1; ; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_gpu( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3100,9 +3106,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3113,9 +3119,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; 
SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3131,15 +3137,15 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB67_1; ; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3147,9 +3153,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3160,9 +3166,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, 
[release_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3178,15 +3184,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB68_1; ; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_sys( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3194,9 +3200,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3207,9 +3213,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3228,12 +3234,12 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_cta( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3241,9 +3247,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3254,9 +3260,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3275,12 +3281,12 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3288,9 +3294,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3301,9 +3307,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; 
; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3322,12 +3328,12 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3335,9 +3341,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3348,9 +3354,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3366,14 +3372,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB72_1; ; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_sys( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3381,9 +3388,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3394,9 +3401,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3412,14 +3419,15 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB73_1; ; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_cta( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3427,9 +3435,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3440,9 +3448,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, 
%rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3458,14 +3466,15 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB74_1; ; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_gpu( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3473,9 +3482,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3486,9 +3495,9 @@ define i8 
@release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3504,14 +3513,15 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB75_1; ; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3519,9 +3529,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: 
cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3532,9 +3542,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3550,14 +3560,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB76_1; ; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_sys( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3565,9 +3576,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3578,9 +3589,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3596,14 +3607,15 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB77_1; ; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_cta( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3611,9 +3623,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; 
SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3624,9 +3636,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3642,14 +3654,15 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB78_1; ; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_gpu( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3657,9 +3670,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 
%rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3670,9 +3683,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3688,14 +3701,15 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB79_1; ; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3703,9 +3717,9 @@ define i8 @release_monotonic_i8_shared(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3716,9 +3730,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3734,14 +3748,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB80_1; ; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_sys( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; 
SM70-NEXT: .reg .b16 %rs<2>; @@ -3749,8 +3764,8 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3762,9 +3777,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3780,14 +3795,15 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB81_1; ; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( ; 
SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3795,8 +3811,8 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3808,9 +3824,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3826,14 +3842,15 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB82_1; ; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_gpu( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { 
+; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3841,8 +3858,8 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3854,9 +3871,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3872,14 +3889,15 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB83_1; ; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic( +define i8 
@acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3887,8 +3905,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3900,9 +3918,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3921,12 +3939,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: 
.reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3934,9 +3952,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3947,9 +3965,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3965,15 +3983,15 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB85_1; ; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_cta( +define i8 
@acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -3981,9 +3999,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -3994,9 +4012,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4012,15 +4030,15 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB86_1; ; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 
@release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4028,9 +4046,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4041,9 +4059,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4059,15 +4077,15 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB87_1; ; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4075,9 +4093,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4088,9 +4106,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4106,15 +4124,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB88_1; ; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_sys( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4122,9 +4140,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4135,9 +4153,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4153,15 +4171,15 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: 
@%p2 bra $L__BB89_1; ; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_cta( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4169,9 +4187,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4182,33 +4200,33 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB90_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB90_1; ; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_gpu( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4216,9 +4234,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4229,33 +4247,33 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 
%ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB91_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB91_1; ; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4263,9 +4281,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[release_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4276,33 +4294,33 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB92_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB92_1; ; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 
@release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_sys( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4310,8 +4328,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4323,33 +4341,33 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB93_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: 
mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB93_1; ; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_cta( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4357,8 +4375,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4370,33 +4388,33 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; 
SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB94_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB94_1; ; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_gpu( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4404,8 +4422,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4417,33 +4435,33 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB95_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB95_1; ; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic( +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4451,9 +4469,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; +; 
SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4464,33 +4482,33 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB96_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB96_1; ; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_sys( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
acq_rel_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4498,9 +4516,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4511,33 +4529,33 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB97_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB97_1; ; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: 
fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_cta( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4545,9 +4563,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4558,33 +4576,33 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB98_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB98_1; ; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_gpu( +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4592,9 +4610,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4605,33 +4623,33 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB99_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB99_1; ; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global( +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4639,9 +4657,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, 
[acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4652,33 +4670,33 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB100_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB100_1; ; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_sys( +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; 
SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4686,9 +4704,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4699,33 +4717,33 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB101_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB101_1; ; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_cta( +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4733,9 +4751,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4746,33 +4764,33 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB102_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB102_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB102_1; ; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_gpu( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4780,9 +4798,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4793,33 +4811,33 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 
%r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB103_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB103_1; ; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4827,9 +4845,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; 
SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -4840,33 +4858,33 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB104_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB104_1; ; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) 
%addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_sys( +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4874,8 +4892,8 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4887,9 +4905,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4902,18 +4920,18 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB105_1; ; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, 
i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4921,8 +4939,8 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4934,9 +4952,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4949,18 +4967,18 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB106_1; ; 
SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -4968,8 +4986,8 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4981,9 +4999,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4996,18 +5014,18 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // 
in Loop: Header=BB107_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB107_1; ; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic( +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5015,9 +5033,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5028,9 +5046,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, 
%r16, %r2; ; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5043,18 +5061,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB108_1; ; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5062,9 +5080,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5075,33 +5093,33 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; 
SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB109_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB109_1; ; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5109,9 +5127,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5122,33 +5140,33 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB110_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB110_1; ; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
seq_cst_monotonic_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5156,9 +5174,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5169,33 +5187,33 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB111_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB111_1; ; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5203,9 +5221,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5216,33 +5234,33 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB112_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB112_1; ; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_sys( +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5250,9 +5268,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5263,33 +5281,33 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: 
and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB113_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB113_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB113_1; ; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_cta( +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5297,9 +5315,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[acq_rel_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5310,33 +5328,33 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB114_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB114_1; ; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") 
seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_gpu( +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5344,9 +5362,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5357,33 +5375,33 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB115_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM70-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB115_1; ; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared( +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5391,9 +5409,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5404,33 +5422,33 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB116_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB116_1; ; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5438,9 +5456,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[seq_cst_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5451,33 +5469,33 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB117_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB117_1; ; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; 
SM70-NEXT: .reg .b16 %rs<2>; @@ -5485,9 +5503,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5498,33 +5516,33 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB118_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB118_1; ; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5532,9 +5550,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5545,33 +5563,33 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; 
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB119_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB119_1; ; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic( +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5579,9 +5597,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5592,33 +5610,33 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; ; SM70-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB120_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB120_1; ; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_sys( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5626,9 +5644,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5639,33 +5657,33 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB121_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB121_1; ; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_cta( +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5673,9 +5691,9 @@ define i8 
@acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5686,33 +5704,33 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB122_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB122_1; ; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 
%new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5720,9 +5738,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5733,33 +5751,33 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB123_3; 
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB123_1; ; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global( +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5767,9 +5785,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5780,33 +5798,33 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, 
%r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB124_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB124_1; ; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_sys( +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5814,9 +5832,9 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, 
[seq_cst_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5827,33 +5845,33 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB125_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB125_1; ; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_cta( +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
seq_cst_seq_cst_i8_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5861,9 +5879,9 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5874,33 +5892,33 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB126_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB126_1; ; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.cta; +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_gpu( +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5908,9 +5926,9 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5921,33 +5939,33 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB127_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB127_1; ; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -5955,9 +5973,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -5968,33 +5986,33 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 
%r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB128_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB128_1; ; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_sys( +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6002,9 +6020,9 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; -; 
SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6015,33 +6033,33 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB129_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB129_1; ; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_cta( +define i8 
@seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6049,9 +6067,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6062,33 +6080,33 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB130_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: 
@%p2 bra $L__BB130_1; ; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6096,9 +6114,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6109,33 +6127,33 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, 
%r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB131_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB131_1; ; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6143,8 +6161,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6156,33 +6174,33 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 
%r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB132_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB132_1; ; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6190,9 +6208,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6203,33 +6221,33 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB133_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB133_1; ; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: 
seq_cst_seq_cst_i8_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -6237,9 +6255,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -6250,4083 +6268,165 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB134_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: mov.u32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB134_1; ; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: 
fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu( +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; 
+; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB135_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB135_1; ; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global( +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB136_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: mov.u32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB136_1; ; SM70-NEXT: $L__BB136_3: // 
%partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 65535; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: 
ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB137_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB137_1; -; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, 
%r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB138_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB138_1; -; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB139_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB139_1; -; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; 
SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB140_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB140_1; -; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB141_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB141_1; -; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB142_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB142_1; -; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; 
SM70-NEXT: @%p1 bra $L__BB143_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB143_1; -; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB144_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 -; 
SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB144_1; -; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB145_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB145_1; -; 
SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB146_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB146_1; -; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB147_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB147_1; -; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define 
i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB148_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB148_1; -; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_sys( -; 
SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB149_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB149_1; -; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 
%r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB150_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB150_1; -; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB151_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB151_1; -; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB152_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB152_1; -; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: 
cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB153_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB153_1; -; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; 
SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB154_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB154_1; -; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, 
%r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB155_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB155_1; -; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB156_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB156_1; -; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 
%r20, %r16, %r2; -; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB157_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB157_1; -; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, 
%r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB158_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB158_1; -; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 
%p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB159_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB159_1; -; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB160_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: 
Header=BB160_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB160_1; -; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB161_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 
%r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB161_1; -; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB162_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB162_1; -; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB163_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB163_1; -; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg 
ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB164_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB164_1; -; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_sys(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB165_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB165_1; -; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_cta( -; SM70: { -; SM70-NEXT: 
.reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB166_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB166_1; -; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 
%rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB167_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB167_1; -; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; 
SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB168_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB168_1; -; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, 
%rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB169_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB169_1; -; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, 
%r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB170_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB170_1; -; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB171_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB171_1; -; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 
%r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB172_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB172_1; -; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop -; SM70-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB173_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB173_1; -; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; 
SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB174_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB174_1; -; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, 
%r18; -; SM70-NEXT: @%p1 bra $L__BB175_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB175_1; -; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB176_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: 
Header=BB176_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB176_1; -; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB177_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 
%r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB177_1; -; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB178_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB178_1; -; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB179_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB179_1; -; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; -; SM70-NEXT: ret; - %pairold = cmpxchg 
ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB180_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB180_1; -; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic_sys( -; SM70: { 
-; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB181_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB181_1; -; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[monotonic_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB182_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB182_1; -; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, 
[monotonic_monotonic_i16_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB183_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB183_1; -; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 
%r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB184_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB184_1; -; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: 
ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB185_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB185_1; -; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB186_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB186_1; -; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB187_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB187_1; -; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB188_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 -; SM70-NEXT: and.b32 
%r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB188_1; -; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB189_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB189_1; -; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end -; 
SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB190_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB190_1; -; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret 
i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB191_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB191_1; -; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg 
.pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB192_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB192_1; -; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[monotonic_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB193_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB193_1; -; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 
%r9, [monotonic_acquire_i16_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB194_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB194_1; -; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 
%r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB195_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB195_1; -; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB196_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB196_1; -; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB197_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB197_1; -; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB198_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB198_1; -; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB199_3; -; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB199_1; -; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB200_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB200_1; -; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB201_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB201_1; -; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB202_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB202_1; -; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic 
acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB203_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB203_1; -; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
monotonic_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB204_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB204_1; -; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: 
-; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB205_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB205_1; -; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB206_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB206_1; -; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; -; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB207_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB207_1; -; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: 
mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB208_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB208_1; -; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB209_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB209_1; -; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; 
SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB210_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB210_1; -; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB211_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB211_1; -; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: 
atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB212_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB212_1; -; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB213_3; -; 
SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB213_1; -; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB214_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 -; SM70-NEXT: and.b32 
%r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB214_1; -; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB215_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra 
$L__BB215_1; -; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB216_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB216_1; -; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg 
ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB217_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB217_1; -; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
acquire_monotonic_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB218_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB218_1; -; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; 
SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB219_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB219_1; -; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB220_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB220_1; -; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; -; SM70-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB221_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB221_1; -; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB222_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB222_1; -; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB137_1; +; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global_gpu( +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10334,10 +6434,10 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; 
SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10347,31 +6447,30 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB223_3; +; SM70-NEXT: @%p1 bra $L__BB138_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB223_1; -; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB138_1; +; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 
%cmp, i16 %new syncscope("") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10379,10 +6478,10 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10392,31 +6491,30 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB224_3; +; SM70-NEXT: @%p1 bra 
$L__BB139_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB224_1; -; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB139_1; +; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared_sys( +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10424,10 +6522,10 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10437,31 +6535,30 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr 
addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB225_3; +; SM70-NEXT: @%p1 bra $L__BB140_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB225_1; -; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB140_1; +; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared_cta( +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10469,10 +6566,10 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; 
SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10482,31 +6579,30 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB226_3; +; SM70-NEXT: @%p1 bra $L__BB141_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB226_1; -; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB141_1; +; SM70-NEXT: $L__BB141_3: // 
%partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared_gpu( +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10514,10 +6610,10 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10527,31 +6623,30 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 
%r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB227_3; +; SM70-NEXT: @%p1 bra $L__BB142_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB227_1; -; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB142_1; +; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic( +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10559,10 +6654,10 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 
%r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10572,31 +6667,30 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB228_3; +; SM70-NEXT: @%p1 bra $L__BB143_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB228_1; -; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB143_1; +; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic_sys( +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg 
.pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10604,10 +6698,10 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10617,31 +6711,31 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB229_3; +; SM70-NEXT: @%p1 bra $L__BB144_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB229_1; -; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB144_1; +; SM70-NEXT: $L__BB144_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic_cta( +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10649,10 +6743,10 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10662,31 +6756,31 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, 
[%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB230_3; +; SM70-NEXT: @%p1 bra $L__BB145_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB230_1; -; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB145_1; +; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic_gpu( +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10694,10 +6788,10 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ 
-10707,76 +6801,31 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB231_3; +; SM70-NEXT: @%p1 bra $L__BB146_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB231_1; -; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB146_1; +; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB232_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB232_1; -; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global_sys( +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10784,10 +6833,10 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, 
[monotonic_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10797,31 +6846,31 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB233_3; +; SM70-NEXT: @%p1 bra $L__BB147_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB233_1; -; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB147_1; +; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) 
%addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global_cta( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10829,10 +6878,10 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10842,31 +6891,31 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB234_3; +; SM70-NEXT: @%p1 bra $L__BB148_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; 
; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB234_1; -; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB148_1; +; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global_gpu( +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10874,10 +6923,10 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10887,31 +6936,31 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: 
ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB235_3; +; SM70-NEXT: @%p1 bra $L__BB149_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB235_1; -; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB149_1; +; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared( +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10919,10 +6968,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; 
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10932,31 +6981,31 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB236_3; +; SM70-NEXT: @%p1 bra $L__BB150_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB236_1; -; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB150_1; +; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
acquire_acquire_i16_shared_sys( +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -10964,10 +7013,10 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -10977,31 +7026,31 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB237_3; +; SM70-NEXT: @%p1 bra $L__BB151_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 ; 
SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB237_1; -; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB151_1; +; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared_cta( +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11009,10 +7058,10 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11022,31 +7071,31 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; 
-; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB238_3; +; SM70-NEXT: @%p1 bra $L__BB152_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB238_1; -; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB152_1; +; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared_gpu( +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11054,10 +7103,11 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[acquire_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11067,31 +7117,31 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB239_3; +; SM70-NEXT: @%p1 bra $L__BB153_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB239_1; -; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB153_1; +; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic( +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11099,10 +7149,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11113,31 +7163,31 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB240_3; +; SM70-NEXT: @%p1 bra $L__BB154_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB240_1; -; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB154_1; +; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic_sys( +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11145,10 +7195,10 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, 
%r10, 3; @@ -11159,31 +7209,31 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB241_3; +; SM70-NEXT: @%p1 bra $L__BB155_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB241_1; -; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB155_1; +; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic_cta( +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11191,10 +7241,10 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr 
%addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11205,31 +7255,31 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB242_3; +; SM70-NEXT: @%p1 bra $L__BB156_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB242_1; -; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB156_1; +; SM70-NEXT: 
$L__BB156_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic_gpu( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11237,10 +7287,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11251,31 +7301,31 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: 
or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB243_3; +; SM70-NEXT: @%p1 bra $L__BB157_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB243_1; -; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB157_1; +; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global( +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11283,10 +7333,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: 
fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11297,31 +7347,31 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB244_3; +; SM70-NEXT: @%p1 bra $L__BB158_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB244_1; -; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB158_1; +; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_sys( +define i16 
@monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11329,10 +7379,10 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11343,31 +7393,31 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB245_3; +; SM70-NEXT: @%p1 bra $L__BB159_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 
%p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB245_1; -; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB159_1; +; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_cta( +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11375,10 +7425,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11389,31 +7439,31 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 
%r19, %r15, %r2; -; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB246_3; +; SM70-NEXT: @%p1 bra $L__BB160_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB246_1; -; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB160_1; +; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_gpu( +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11421,10 +7471,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, 
[monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11435,31 +7485,31 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB247_3; +; SM70-NEXT: @%p1 bra $L__BB161_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB247_1; -; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB161_1; +; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst 
ret i16 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared( +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11467,11 +7517,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11481,31 +7530,31 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB248_3; +; SM70-NEXT: @%p1 bra $L__BB162_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure 
-; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB248_1; -; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB162_1; +; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared_sys( +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11513,11 +7562,10 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11527,31 +7575,31 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 
%r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB249_3; +; SM70-NEXT: @%p1 bra $L__BB163_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB249_1; -; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB163_1; +; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared_cta( +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11559,11 +7607,10 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[acquire_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11573,31 +7620,31 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB250_3; +; SM70-NEXT: @%p1 bra $L__BB164_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB250_1; -; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB164_1; +; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared_gpu( +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11605,11 +7652,10 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11619,31 +7665,31 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], 
%r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB251_3; +; SM70-NEXT: @%p1 bra $L__BB165_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB251_1; -; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB165_1; +; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11651,11 +7697,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, 
[acquire_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11665,30 +7710,31 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB252_3; +; SM70-NEXT: @%p1 bra $L__BB166_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB252_1; -; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB166_1; +; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic_sys( +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_gpu( ; SM70: { ; 
SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11696,11 +7742,10 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11710,30 +7755,31 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB253_3; +; SM70-NEXT: @%p1 bra $L__BB167_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB253_1; -; SM70-NEXT: 
$L__BB253_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB167_1; +; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic_cta( +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11741,11 +7787,10 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11755,30 +7800,31 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; 
SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB254_3; +; SM70-NEXT: @%p1 bra $L__BB168_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB254_1; -; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB168_1; +; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic_gpu( +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11786,11 +7832,10 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 
%rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11800,30 +7845,31 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB255_3; +; SM70-NEXT: @%p1 bra $L__BB169_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB255_1; -; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB169_1; +; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 
%cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global( +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11831,11 +7877,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11845,30 +7890,31 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB256_3; +; SM70-NEXT: @%p1 bra $L__BB170_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 
+; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB256_1; -; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB170_1; +; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global_sys( +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11876,11 +7922,10 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11890,30 +7935,31 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB257_3; +; SM70-NEXT: @%p1 bra $L__BB171_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB257_1; -; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB171_1; +; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global_cta( +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11921,11 +7967,10 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: 
ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11935,30 +7980,31 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB258_3; +; SM70-NEXT: @%p1 bra $L__BB172_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB258_1; -; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB172_1; +; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global_gpu( +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11966,11 +8012,10 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -11980,30 +8025,31 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB259_3; +; SM70-NEXT: @%p1 bra $L__BB173_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB259_1; -; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB173_1; +; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared( +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12011,11 +8057,10 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, 
%r11, 3; @@ -12025,30 +8070,31 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB260_3; +; SM70-NEXT: @%p1 bra $L__BB174_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB260_1; -; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB174_1; +; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared_sys( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12056,11 +8102,10 @@ define i16 
@release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -12070,30 +8115,31 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB261_3; +; SM70-NEXT: @%p1 bra $L__BB175_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB261_1; -; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra 
$L__BB175_1; +; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared_cta( +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12101,11 +8147,10 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -12115,30 +8160,31 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB262_3; +; SM70-NEXT: @%p1 bra $L__BB176_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB262_1; -; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB176_1; +; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared_gpu( +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12146,11 +8192,10 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, 
[acquire_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -12160,30 +8205,31 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB263_3; +; SM70-NEXT: @%p1 bra $L__BB177_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB263_1; -; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB177_1; +; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, 
i16 %new) { -; SM70-LABEL: release_acquire_i16_generic( +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12191,11 +8237,10 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -12205,31 +8250,31 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB264_3; +; SM70-NEXT: @%p1 bra $L__BB178_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 
; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB264_1; -; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB178_1; +; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic_sys( +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12237,11 +8282,10 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -12251,31 +8295,31 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 
%r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB265_3; +; SM70-NEXT: @%p1 bra $L__BB179_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB265_1; -; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB179_1; +; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic_cta( +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12283,10 +8327,10 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[release_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12297,31 +8341,31 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB266_3; +; SM70-NEXT: @%p1 bra $L__BB180_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB266_1; -; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB180_1; +; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + 
%pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic_gpu( +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12329,10 +8373,10 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12343,31 +8387,31 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB267_3; +; SM70-NEXT: @%p1 bra 
$L__BB181_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB267_1; -; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB181_1; +; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global( +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12375,10 +8419,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12389,31 +8433,31 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) 
%addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB268_3; +; SM70-NEXT: @%p1 bra $L__BB182_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB268_1; -; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB182_1; +; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global_sys( +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12421,10 +8465,10 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; 
SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12435,31 +8479,31 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB269_3; +; SM70-NEXT: @%p1 bra $L__BB183_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB269_1; -; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB183_1; +; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global_cta( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12467,10 +8511,10 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12481,31 +8525,31 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB270_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM70-NEXT: @%p1 bra $L__BB270_3; +; SM70-NEXT: @%p1 bra $L__BB184_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB270_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB270_1; -; SM70-NEXT: $L__BB270_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB184_1; +; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global_gpu( +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12513,10 +8557,10 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ 
-12527,31 +8571,31 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB271_3; +; SM70-NEXT: @%p1 bra $L__BB185_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB271_1; -; SM70-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB185_1; +; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared( +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12559,10 +8603,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 
%rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12573,31 +8617,31 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB272_3; +; SM70-NEXT: @%p1 bra $L__BB186_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB272_1; -; SM70-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB186_1; +; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared_sys( +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12605,10 +8649,10 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12619,31 +8663,31 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, 
[%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB273_3; +; SM70-NEXT: @%p1 bra $L__BB187_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB273_1; -; SM70-NEXT: $L__BB273_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB187_1; +; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared_cta( +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12651,10 +8695,10 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12665,31 +8709,31 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB274_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB274_3; +; SM70-NEXT: @%p1 bra $L__BB188_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB274_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB274_1; -; SM70-NEXT: $L__BB274_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB188_1; +; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared_gpu( +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_sys( ; SM70: { ; 
SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12697,10 +8741,10 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12711,31 +8755,30 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB275_3; +; SM70-NEXT: @%p1 bra $L__BB189_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB275_1; -; SM70-NEXT: $L__BB275_3: 
// %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB189_1; +; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic( +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12743,10 +8786,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12757,31 +8800,30 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB276_3; +; SM70-NEXT: @%p1 bra $L__BB190_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB276_1; -; SM70-NEXT: $L__BB276_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB190_1; +; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic_sys( +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12789,10 +8831,10 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: 
fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12803,31 +8845,30 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB277_3; +; SM70-NEXT: @%p1 bra $L__BB191_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB277_1; -; SM70-NEXT: $L__BB277_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB191_1; +; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic_cta( +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: 
release_monotonic_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12835,10 +8876,10 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12849,31 +8890,30 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB278_3; +; SM70-NEXT: @%p1 bra $L__BB192_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra 
$L__BB278_1; -; SM70-NEXT: $L__BB278_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB192_1; +; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic_gpu( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12881,10 +8921,10 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12895,31 +8935,30 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB279_1: // %partword.cmpxchg.loop 
+; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB279_3; +; SM70-NEXT: @%p1 bra $L__BB193_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB279_1; -; SM70-NEXT: $L__BB279_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB193_1; +; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global( +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12927,10 +8966,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, 
[release_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12941,31 +8980,30 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB280_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB280_3; +; SM70-NEXT: @%p1 bra $L__BB194_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB280_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB280_1; -; SM70-NEXT: $L__BB280_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB194_1; +; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 
%cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global_sys( +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -12973,10 +9011,10 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12987,31 +9025,30 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB281_3; +; SM70-NEXT: @%p1 bra $L__BB195_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB281_1 
Depth=1 +; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB281_1; -; SM70-NEXT: $L__BB281_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB195_1; +; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global_cta( +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13019,10 +9056,10 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13033,31 +9070,30 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB282_3; +; SM70-NEXT: @%p1 bra $L__BB196_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB282_1; -; SM70-NEXT: $L__BB282_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB196_1; +; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global_gpu( +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13065,10 +9101,10 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[release_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13079,31 +9115,30 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB283_3; +; SM70-NEXT: @%p1 bra $L__BB197_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB283_1; -; SM70-NEXT: $L__BB283_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB197_1; +; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared( +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13111,10 +9146,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13125,31 +9160,31 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], 
%r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB284_3; +; SM70-NEXT: @%p1 bra $L__BB198_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB284_1; -; SM70-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB198_1; +; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared_sys( +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13157,10 +9192,10 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; 
@@ -13171,31 +9206,31 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB285_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB285_3; +; SM70-NEXT: @%p1 bra $L__BB199_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB285_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB285_1; -; SM70-NEXT: $L__BB285_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB199_1; +; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared_cta( +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13203,10 +9238,10 @@ define i16 
@release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13217,31 +9252,31 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB286_3; +; SM70-NEXT: @%p1 bra $L__BB200_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB286_1; -; SM70-NEXT: $L__BB286_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: 
@%p2 bra $L__BB200_1; +; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared_gpu( +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13249,10 +9284,10 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13263,31 +9298,31 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB287_3; +; SM70-NEXT: @%p1 bra $L__BB201_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB287_1; -; SM70-NEXT: $L__BB287_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB201_1; +; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13295,10 +9330,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13309,31 +9344,31 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB288_3; +; SM70-NEXT: @%p1 bra $L__BB202_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB288_1; -; SM70-NEXT: $L__BB288_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB202_1; +; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic_sys( 
+define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13341,10 +9376,10 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13355,31 +9390,31 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB289_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB289_3; +; SM70-NEXT: @%p1 bra $L__BB203_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB289_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM70-NEXT: and.b32 %r8, 
%r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB289_1; -; SM70-NEXT: $L__BB289_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB203_1; +; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic_cta( +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13387,10 +9422,10 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13401,31 +9436,31 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: 
ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB290_3; +; SM70-NEXT: @%p1 bra $L__BB204_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB290_1; -; SM70-NEXT: $L__BB290_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB204_1; +; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu( +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13433,10 +9468,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13447,31 +9482,31 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB291_3; +; SM70-NEXT: @%p1 bra $L__BB205_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB291_1; -; SM70-NEXT: $L__BB291_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB205_1; +; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global( +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13479,10 +9514,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13493,31 +9528,31 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB292_3; +; SM70-NEXT: @%p1 bra $L__BB206_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB292_1; -; SM70-NEXT: $L__BB292_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB206_1; +; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global_sys( +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13525,10 +9560,10 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13539,31 +9574,31 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB293_3; +; SM70-NEXT: @%p1 bra $L__BB207_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB293_1; -; SM70-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB207_1; +; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global_cta( +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13571,10 +9606,10 @@ define 
i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13585,31 +9620,31 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB294_3; +; SM70-NEXT: @%p1 bra $L__BB208_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB294_1; -; SM70-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB208_1; +; 
SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global_gpu( +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13617,10 +9652,10 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13631,31 +9666,31 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB295_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 
%r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB295_3; +; SM70-NEXT: @%p1 bra $L__BB209_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB295_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB295_1; -; SM70-NEXT: $L__BB295_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB209_1; +; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared( +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13663,10 +9698,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; 
SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13677,31 +9712,31 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB296_3; +; SM70-NEXT: @%p1 bra $L__BB210_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB296_1; -; SM70-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB210_1; +; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared_sys( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, 
i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13709,10 +9744,10 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13723,31 +9758,31 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB297_3; +; SM70-NEXT: @%p1 bra $L__BB211_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB297_1; -; SM70-NEXT: $L__BB297_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB211_1; +; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared_cta( +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13755,10 +9790,10 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13769,31 +9804,31 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; 
SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB298_3; +; SM70-NEXT: @%p1 bra $L__BB212_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB298_1; -; SM70-NEXT: $L__BB298_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB212_1; +; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu( +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13801,10 +9836,10 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: 
ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13815,31 +9850,31 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB299_3; +; SM70-NEXT: @%p1 bra $L__BB213_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB299_1; -; SM70-NEXT: $L__BB299_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB213_1; +; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) 
%addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic( +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13847,10 +9882,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13861,31 +9896,31 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB300_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, 
%r17; -; SM70-NEXT: @%p1 bra $L__BB300_3; +; SM70-NEXT: @%p1 bra $L__BB214_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB300_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB300_1; -; SM70-NEXT: $L__BB300_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB214_1; +; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic_sys( +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13893,10 +9928,10 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13907,31 
+9942,31 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB301_3; +; SM70-NEXT: @%p1 bra $L__BB215_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB301_1; -; SM70-NEXT: $L__BB301_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB215_1; +; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic_cta( +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13939,10 +9974,10 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, 
i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13953,31 +9988,31 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB302_3; +; SM70-NEXT: @%p1 bra $L__BB216_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB302_1; -; SM70-NEXT: $L__BB302_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB216_1; +; SM70-NEXT: $L__BB216_3: // 
%partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic_gpu( +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -13985,10 +10020,10 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -13999,31 +10034,31 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; 
SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB303_3; +; SM70-NEXT: @%p1 bra $L__BB217_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB303_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB303_1; -; SM70-NEXT: $L__BB303_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB217_1; +; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global( +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14031,10 +10066,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 
%r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14045,31 +10080,31 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB304_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB304_3; +; SM70-NEXT: @%p1 bra $L__BB218_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB304_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB304_1; -; SM70-NEXT: $L__BB304_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB218_1; +; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global_sys( +define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: 
acq_rel_monotonic_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14077,10 +10112,10 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14091,31 +10126,31 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB305_3; +; SM70-NEXT: @%p1 bra $L__BB219_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB305_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB305_1; -; SM70-NEXT: $L__BB305_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 
%r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB219_1; +; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global_cta( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14123,10 +10158,10 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14137,31 +10172,31 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB306_3; +; SM70-NEXT: @%p1 bra $L__BB220_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB306_1; -; SM70-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB220_1; +; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global_gpu( +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14169,10 +10204,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; 
SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14183,31 +10218,31 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB307_3; +; SM70-NEXT: @%p1 bra $L__BB221_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB307_1; -; SM70-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB221_1; +; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared( +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_sys( ; SM70: { 
; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14215,10 +10250,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14229,31 +10264,31 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB308_3; +; SM70-NEXT: @%p1 bra $L__BB222_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB308_1; -; SM70-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB222_1; +; 
SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared_sys( +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14261,10 +10296,10 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14275,31 +10310,31 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB309_3; +; SM70-NEXT: @%p1 bra $L__BB223_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB309_1; -; SM70-NEXT: $L__BB309_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB223_1; +; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared_cta( +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14307,10 +10342,10 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14321,31 +10356,31 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB310_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB310_3; +; SM70-NEXT: @%p1 bra $L__BB224_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB310_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB310_1; -; SM70-NEXT: $L__BB310_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB224_1; +; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) 
%addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared_gpu( +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14353,10 +10388,10 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14367,31 +10402,31 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB311_3; +; SM70-NEXT: @%p1 bra $L__BB225_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB311_1 Depth=1 +; 
SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB311_1; -; SM70-NEXT: $L__BB311_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB225_1; +; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic( +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14399,10 +10434,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14413,31 +10448,31 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; 
-; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB312_3; +; SM70-NEXT: @%p1 bra $L__BB226_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB312_1; -; SM70-NEXT: $L__BB312_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB226_1; +; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys( +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14445,10 +10480,10 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; -; 
SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14459,31 +10494,31 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB313_3; +; SM70-NEXT: @%p1 bra $L__BB227_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB313_1; -; SM70-NEXT: $L__BB313_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB227_1; +; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta( +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14491,10 +10526,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14505,31 +10540,31 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB314_3; +; SM70-NEXT: @%p1 bra $L__BB228_3; ; SM70-NEXT: 
// %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB314_1; -; SM70-NEXT: $L__BB314_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB228_1; +; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14537,10 +10572,10 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14551,31 +10586,31 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 
%cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB315_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB315_3; +; SM70-NEXT: @%p1 bra $L__BB229_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB315_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB315_1; -; SM70-NEXT: $L__BB315_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB229_1; +; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global( +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14583,10 +10618,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; 
; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14597,31 +10632,31 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB316_3; +; SM70-NEXT: @%p1 bra $L__BB230_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB316_1; -; SM70-NEXT: $L__BB316_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB230_1; +; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM70-NEXT: 
fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global_sys( +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14629,10 +10664,10 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14643,31 +10678,31 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; 
SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB317_3; +; SM70-NEXT: @%p1 bra $L__BB231_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB317_1; -; SM70-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB231_1; +; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14675,10 +10710,10 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: 
ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14689,31 +10724,31 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB318_3; +; SM70-NEXT: @%p1 bra $L__BB232_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB318_1; -; SM70-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB232_1; +; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu( +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) 
{ +; SM70-LABEL: acq_rel_acquire_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14721,10 +10756,10 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14735,31 +10770,31 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB319_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB319_3; +; SM70-NEXT: @%p1 bra $L__BB233_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB319_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; 
-; SM70-NEXT: @%p2 bra $L__BB319_1; -; SM70-NEXT: $L__BB319_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB233_1; +; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared( +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14767,10 +10802,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14781,31 +10816,31 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM70-NEXT: 
$L__BB234_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB320_3; +; SM70-NEXT: @%p1 bra $L__BB234_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB320_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB320_1; -; SM70-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB234_1; +; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys( +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14813,10 +10848,10 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14827,31 +10862,31 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB321_3; +; SM70-NEXT: @%p1 bra $L__BB235_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB321_1; -; SM70-NEXT: $L__BB321_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB235_1; +; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
acq_rel_seq_cst_i16_shared_cta( +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14859,10 +10894,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14873,31 +10908,31 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB322_3; +; SM70-NEXT: @%p1 bra $L__BB236_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM70-NEXT: 
and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB322_1; -; SM70-NEXT: $L__BB322_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB236_1; +; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu( +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14905,10 +10940,10 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14919,31 +10954,31 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB323_3; +; SM70-NEXT: @%p1 bra $L__BB237_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB323_1; -; SM70-NEXT: $L__BB323_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB237_1; +; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14951,10 +10986,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; 
SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -14965,31 +11000,31 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB324_3; +; SM70-NEXT: @%p1 bra $L__BB238_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB324_1; -; SM70-NEXT: $L__BB324_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB238_1; +; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst 
monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic_sys( +define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -14997,10 +11032,10 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15011,31 +11046,31 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB325_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM70-NEXT: @%p1 bra $L__BB325_3; +; SM70-NEXT: @%p1 bra $L__BB239_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB325_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB325_1; -; SM70-NEXT: $L__BB325_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB239_1; +; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic_cta( +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15043,10 +11078,10 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 
3; @@ -15057,31 +11092,31 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB326_3; +; SM70-NEXT: @%p1 bra $L__BB240_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB326_1; -; SM70-NEXT: $L__BB326_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB240_1; +; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu( +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15089,10 +11124,10 @@ define i16 
@seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15103,31 +11138,31 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB327_3; +; SM70-NEXT: @%p1 bra $L__BB241_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB327_1; -; SM70-NEXT: $L__BB327_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 
bra $L__BB241_1; +; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global( +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15135,10 +11170,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15149,31 +11184,31 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB328_3; +; SM70-NEXT: @%p1 bra $L__BB242_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB328_1; -; SM70-NEXT: $L__BB328_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB242_1; +; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global_sys( +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15181,10 +11216,10 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: 
ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15195,31 +11230,31 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB329_3; +; SM70-NEXT: @%p1 bra $L__BB243_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB329_1; -; SM70-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB243_1; +; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global_cta( +define i16 
@seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15227,10 +11262,10 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15241,31 +11276,31 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB330_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB330_3; +; SM70-NEXT: @%p1 bra $L__BB244_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB330_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, 
%r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB330_1; -; SM70-NEXT: $L__BB330_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB244_1; +; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global_gpu( +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15273,10 +11308,10 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15287,31 +11322,31 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; 
-; SM70-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB331_3; +; SM70-NEXT: @%p1 bra $L__BB245_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB331_1; -; SM70-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB245_1; +; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared( +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15319,10 +11354,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15333,31 +11368,31 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB332_3; +; SM70-NEXT: @%p1 bra $L__BB246_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB332_1; -; SM70-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB246_1; +; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_sys(ptr 
addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared_sys( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15365,10 +11400,10 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15379,31 +11414,31 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB333_3; +; SM70-NEXT: @%p1 bra $L__BB247_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // 
in Loop: Header=BB333_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB333_1; -; SM70-NEXT: $L__BB333_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB247_1; +; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared_cta( +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15411,10 +11446,10 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15425,31 +11460,31 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 
%cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB334_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB334_3; +; SM70-NEXT: @%p1 bra $L__BB248_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB334_1; -; SM70-NEXT: $L__BB334_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB248_1; +; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu( +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15457,10 +11492,10 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 
%cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15471,31 +11506,31 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB335_3; +; SM70-NEXT: @%p1 bra $L__BB249_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB335_1; -; SM70-NEXT: $L__BB335_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB249_1; +; SM70-NEXT: 
$L__BB249_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic( +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15503,10 +11538,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15517,31 +11552,31 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 
%r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB336_3; +; SM70-NEXT: @%p1 bra $L__BB250_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB336_1; -; SM70-NEXT: $L__BB336_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB250_1; +; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic_sys( +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15549,10 +11584,10 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; 
SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15563,31 +11598,31 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB337_3; +; SM70-NEXT: @%p1 bra $L__BB251_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB337_1; -; SM70-NEXT: $L__BB337_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB251_1; +; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic_cta( +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; 
SM70-LABEL: seq_cst_acquire_i16_generic_sys( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15595,10 +11630,10 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15609,31 +11644,31 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB338_3; +; SM70-NEXT: @%p1 bra $L__BB252_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB338_1; -; 
SM70-NEXT: $L__BB338_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB252_1; +; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic_gpu( +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15641,10 +11676,10 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15655,31 +11690,31 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB253_1: 
// %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB339_3; +; SM70-NEXT: @%p1 bra $L__BB253_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB339_1; -; SM70-NEXT: $L__BB339_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB253_1; +; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global( +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_generic_gpu( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -15687,10 +11722,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; 
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15701,26 +11736,26 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB340_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB340_3; +; SM70-NEXT: @%p1 bra $L__BB254_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB340_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB340_1; -; SM70-NEXT: $L__BB340_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB254_1; +; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire ret i16 %new } @@ -15733,10 +11768,10 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; 
SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15747,22 +11782,22 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB341_3; +; SM70-NEXT: @%p1 bra $L__BB255_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB341_1; -; SM70-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB255_1; +; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -15779,10 +11814,10 @@ 
define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15793,22 +11828,22 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB342_3; +; SM70-NEXT: @%p1 bra $L__BB256_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB342_1; -; SM70-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB256_1; +; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -15825,10 +11860,10 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15839,22 +11874,22 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB343_3; +; SM70-NEXT: @%p1 bra $L__BB257_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB343_1; -; SM70-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB257_1; +; SM70-NEXT: 
$L__BB257_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -15862,52 +11897,6 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB344_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB344_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB344_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB344_1; -; SM70-NEXT: $L__BB344_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_acquire_i16_shared_sys( ; SM70: { @@ -15917,10 +11906,10 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15931,22 +11920,22 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB345_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB345_3; +; SM70-NEXT: @%p1 bra $L__BB258_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB345_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, 
%r8; -; SM70-NEXT: @%p2 bra $L__BB345_1; -; SM70-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB258_1; +; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -15963,10 +11952,10 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -15977,22 +11966,22 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB346_3; +; SM70-NEXT: @%p1 bra $L__BB259_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM70-NEXT: and.b32 %r8, 
%r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB346_1; -; SM70-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB259_1; +; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16009,10 +11998,10 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16023,22 +12012,22 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB347_3; +; SM70-NEXT: @%p1 bra $L__BB260_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB347_1 
Depth=1 +; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB347_1; -; SM70-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB260_1; +; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16046,52 +12035,6 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB348_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB348_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB348_1 
Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB348_1; -; SM70-NEXT: $L__BB348_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys( ; SM70: { @@ -16101,10 +12044,10 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16115,22 +12058,22 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB349_3; +; SM70-NEXT: 
@%p1 bra $L__BB261_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB349_1; -; SM70-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB261_1; +; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16147,10 +12090,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16161,22 +12104,22 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, 
%r17; -; SM70-NEXT: @%p1 bra $L__BB350_3; +; SM70-NEXT: @%p1 bra $L__BB262_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB350_1; -; SM70-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB262_1; +; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16193,10 +12136,10 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16207,22 +12150,22 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 
%r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB351_3; +; SM70-NEXT: @%p1 bra $L__BB263_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB351_1; -; SM70-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB263_1; +; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16230,52 +12173,6 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB352_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; 
-; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB352_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB352_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB352_1; -; SM70-NEXT: $L__BB352_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_seq_cst_i16_global_sys( ; SM70: { @@ -16285,10 +12182,10 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16299,22 +12196,22 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB264_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB353_3; +; SM70-NEXT: @%p1 bra $L__BB264_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB353_1; -; SM70-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB264_1; +; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16331,10 +12228,10 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16345,22 +12242,22 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; 
SM70-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB354_3; +; SM70-NEXT: @%p1 bra $L__BB265_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB354_1; -; SM70-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB265_1; +; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16377,10 +12274,10 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16391,22 +12288,22 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; 
SM70-NEXT: ld.global.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB355_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB355_3; +; SM70-NEXT: @%p1 bra $L__BB266_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB355_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB355_1; -; SM70-NEXT: $L__BB355_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB266_1; +; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16414,52 +12311,6 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB356_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB356_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB356_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB356_1; -; SM70-NEXT: $L__BB356_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM70-LABEL: seq_cst_seq_cst_i16_shared_sys( ; SM70: { @@ -16469,10 +12320,10 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16483,22 +12334,22 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr 
addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB357_3; +; SM70-NEXT: @%p1 bra $L__BB267_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB357_1; -; SM70-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB267_1; +; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16515,10 +12366,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, 
%r10, 3; @@ -16529,22 +12380,22 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB358_3; +; SM70-NEXT: @%p1 bra $L__BB268_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB358_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB358_1; -; SM70-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB268_1; +; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16561,10 +12412,10 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 
%rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -16575,22 +12426,22 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB359_3; +; SM70-NEXT: @%p1 bra $L__BB269_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB359_1; -; SM70-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: @%p2 bra $L__BB269_1; +; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; @@ -16598,23 +12449,6 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; 
SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_monotonic_i32_generic_sys( ; SM70: { @@ -16622,9 +12456,9 @@ define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16639,9 +12473,9 @@ define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16656,9 +12490,9 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 
%cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16666,23 +12500,6 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_monotonic_i32_global_sys( ; SM70: { @@ -16690,9 +12507,9 @@ define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, 
[monotonic_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16707,9 +12524,9 @@ define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16724,9 +12541,9 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16734,23 +12551,6 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ret i32 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_monotonic_i32_shared_sys( ; SM70: { @@ -16758,9 +12558,9 @@ define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16775,9 +12575,9 @@ define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16792,9 +12592,9 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16802,23 +12602,6 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ret i32 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, 
[%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_acquire_i32_generic_sys( ; SM70: { @@ -16826,9 +12609,9 @@ define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16843,9 +12626,9 @@ define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16860,9 +12643,9 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16870,23 +12653,6 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_acquire_i32_global_sys( ; SM70: { @@ -16894,9 +12660,9 @@ define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 
%rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16911,9 +12677,9 @@ define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16928,9 +12694,9 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16938,23 +12704,6 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 
@monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_acquire_i32_shared_sys( ; SM70: { @@ -16962,9 +12711,9 @@ define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16979,9 +12728,9 @@ define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -16996,9 +12745,9 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17006,24 +12755,6 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define 
i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_seq_cst_i32_generic_sys( ; SM70: { @@ -17031,10 +12762,10 @@ define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17049,10 +12780,10 @@ define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17067,10 +12798,10 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17078,24 +12809,6 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_seq_cst_i32_global_sys( ; SM70: { @@ -17103,10 +12816,10 @@ define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17121,10 +12834,10 @@ define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17139,10 +12852,10 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17150,24 +12863,6 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr 
addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: monotonic_seq_cst_i32_shared_sys( ; SM70: { @@ -17175,10 +12870,10 @@ define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17193,10 +12888,10 @@ define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 
%rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17211,10 +12906,10 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17222,23 +12917,6 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_monotonic_i32_generic_sys( ; SM70: { @@ -17246,9 +12924,9 @@ define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17263,9 +12941,9 @@ define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17280,9 +12958,9 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17290,23 +12968,6 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_monotonic_i32_global_sys( ; SM70: { @@ -17314,9 +12975,9 @@ define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17331,9 +12992,9 @@ define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17348,9 +13009,9 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17358,23 +13019,6 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 
@acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_monotonic_i32_shared_sys( ; SM70: { @@ -17382,9 +13026,9 @@ define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17399,9 +13043,9 @@ define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acquire_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17416,9 +13060,9 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17426,23 +13070,6 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_generic_sys(ptr 
%addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_acquire_i32_generic_sys( ; SM70: { @@ -17450,9 +13077,9 @@ define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17467,9 +13094,9 @@ define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17484,9 +13111,9 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17494,23 +13121,6 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_acquire_i32_global_sys( ; SM70: { @@ -17518,9 +13128,9 @@ define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17535,9 +13145,9 @@ define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17552,9 +13162,9 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17562,23 +13172,6 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; 
SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_acquire_i32_shared_sys( ; SM70: { @@ -17586,9 +13179,9 @@ define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17603,9 +13196,9 @@ define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], 
%r2; ; SM70-NEXT: ret; @@ -17620,9 +13213,9 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17630,24 +13223,6 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_seq_cst_i32_generic_sys( ; SM70: { @@ -17655,10 +13230,10 @@ define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17673,10 +13248,10 @@ define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17691,10 +13266,10 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17702,24 +13277,6 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_seq_cst_i32_global_sys( ; SM70: { @@ -17727,10 +13284,10 @@ define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17745,10 +13302,10 @@ define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17763,10 +13320,10 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17774,24 +13331,6 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM70-NEXT: 
atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acquire_seq_cst_i32_shared_sys( ; SM70: { @@ -17799,10 +13338,10 @@ define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17817,10 +13356,10 @@ define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17835,10 +13374,10 @@ define i32 
@acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17846,23 +13385,6 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_monotonic_i32_generic_sys( ; SM70: { @@ -17870,9 +13392,9 @@ define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, 
[release_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17887,9 +13409,9 @@ define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17904,9 +13426,9 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: 
ret; @@ -17914,23 +13436,6 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_monotonic_i32_global_sys( ; SM70: { @@ -17938,9 +13443,9 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -17949,53 +13454,36 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i } define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global_cta( -; SM70: { 
-; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; -; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global_gpu( +; SM70-LABEL: release_monotonic_i32_global_cta( ; SM70: { ; SM70-NEXT: .reg .b32 %r<4>; ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; -; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic ret i32 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared( +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; 
SM70-LABEL: release_monotonic_i32_global_gpu( ; SM70: { ; SM70-NEXT: .reg .b32 %r<4>; ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic ret i32 %new } @@ -18006,9 +13494,9 @@ define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18023,9 +13511,9 @@ define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[release_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18040,9 +13528,9 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18050,23 +13538,6 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; 
-; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_acquire_i32_generic_sys( ; SM70: { @@ -18074,9 +13545,9 @@ define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18091,9 +13562,9 @@ define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18108,9 +13579,9 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: 
ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18118,23 +13589,6 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_acquire_i32_global_sys( ; SM70: { @@ -18142,9 +13596,9 @@ define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM70-NEXT: 
ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18159,9 +13613,9 @@ define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18176,9 +13630,9 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18186,23 +13640,6 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; 
SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_acquire_i32_shared_sys( ; SM70: { @@ -18210,9 +13647,9 @@ define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18227,9 +13664,9 @@ define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[release_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18244,9 +13681,9 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18254,24 +13691,6 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_seq_cst_i32_generic_sys( ; SM70: { @@ -18279,10 +13698,10 @@ define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 
%rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18297,10 +13716,10 @@ define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18315,10 +13734,10 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, 
[release_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18326,24 +13745,6 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_seq_cst_i32_global_sys( ; SM70: { @@ -18351,10 +13752,10 @@ define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: 
ret; @@ -18369,10 +13770,10 @@ define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18387,10 +13788,10 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18398,24 +13799,6 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: release_seq_cst_i32_shared_sys( ; SM70: { @@ -18423,10 +13806,10 @@ define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18441,10 +13824,10 @@ define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[release_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18459,10 +13842,10 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18470,23 +13853,6 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_monotonic_i32_generic_sys( ; SM70: { @@ -18494,9 +13860,9 @@ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; 
SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18511,9 +13877,9 @@ define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18528,9 +13894,9 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: 
ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18538,23 +13904,6 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_monotonic_i32_global_sys( ; SM70: { @@ -18562,9 +13911,9 @@ define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18579,9 +13928,9 @@ define i32 @acq_rel_monotonic_i32_global_cta(ptr 
addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18596,9 +13945,9 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18606,23 +13955,6 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acq_rel_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_monotonic_i32_shared_sys( ; SM70: { @@ -18630,9 +13962,9 @@ define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18647,9 +13979,9 @@ define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18664,9 +13996,9 @@ define i32 
@acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18674,23 +14006,6 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_acquire_i32_generic_sys( ; SM70: { @@ -18698,9 +14013,9 @@ define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18715,9 +14030,9 @@ define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18732,9 +14047,9 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18742,23 +14057,6 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 
%new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_acquire_i32_global_sys( ; SM70: { @@ -18766,9 +14064,9 @@ define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18783,9 +14081,9 @@ define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18800,9 +14098,9 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18810,23 +14108,6 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 
%new) { ; SM70-LABEL: acq_rel_acquire_i32_shared_sys( ; SM70: { @@ -18834,9 +14115,9 @@ define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18851,9 +14132,9 @@ define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18868,9 +14149,9 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, 
[acq_rel_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18878,24 +14159,6 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_seq_cst_i32_generic_sys( ; SM70: { @@ -18903,10 +14166,10 @@ define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18921,10 +14184,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18939,10 +14202,10 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18950,24 +14213,6 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_seq_cst_i32_global_sys( ; SM70: { @@ -18975,10 +14220,10 @@ define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -18993,10 +14238,10 @@ define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[acq_rel_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19011,10 +14256,10 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19022,24 +14267,6 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys( ; SM70: { @@ -19047,10 +14274,10 @@ define i32 
@acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19065,10 +14292,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19083,10 +14310,10 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19094,24 +14321,6 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_monotonic_i32_generic_sys( ; SM70: { @@ -19119,10 +14328,10 @@ define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], 
%r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19137,10 +14346,10 @@ define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19155,10 +14364,10 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19166,24 +14375,6 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_monotonic_i32_global_sys( ; SM70: { @@ -19191,10 +14382,10 @@ define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19209,10 +14400,10 @@ define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, 
[seq_cst_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19227,10 +14418,10 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19238,24 +14429,6 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; 
SM70-LABEL: seq_cst_monotonic_i32_shared_sys( ; SM70: { @@ -19263,10 +14436,10 @@ define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19281,10 +14454,10 @@ define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19299,10 +14472,10 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; 
-; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19310,24 +14483,6 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_acquire_i32_generic_sys( ; SM70: { @@ -19335,10 +14490,10 @@ define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[seq_cst_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19353,10 +14508,10 @@ define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19371,10 +14526,10 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19382,24 +14537,6 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global( -; SM70: { -; SM70-NEXT: .reg 
.b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_acquire_i32_global_sys( ; SM70: { @@ -19407,10 +14544,10 @@ define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19425,10 +14562,10 @@ define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM70-NEXT: 
ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19443,32 +14580,14 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + %pairold = cmpxchg ptr 
addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire ret i32 %new } @@ -19479,10 +14598,10 @@ define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19497,10 +14616,10 @@ define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19515,10 +14634,10 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; ; SM70-NEXT: 
fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19526,24 +14645,6 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys( ; SM70: { @@ -19551,10 +14652,10 @@ define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, 
[seq_cst_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19569,10 +14670,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19587,10 +14688,10 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19598,24 +14699,6 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_global( -; SM70: { -; SM70-NEXT: .reg 
.b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: seq_cst_seq_cst_i32_global_sys( ; SM70: { @@ -19623,10 +14706,10 @@ define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19641,10 +14724,10 @@ define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: 
ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19659,10 +14742,10 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19670,24 +14753,6 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM70-LABEL: 
seq_cst_seq_cst_i32_shared_sys( ; SM70: { @@ -19695,10 +14760,10 @@ define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19713,10 +14778,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19731,10 +14796,10 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, 
[seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -19742,31 +14807,15 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; 
@@ -19780,9 +14829,9 @@ define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19796,9 +14845,9 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19806,31 +14855,15 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM70-NEXT: 
ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19844,9 +14877,9 @@ define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ 
-19860,9 +14893,9 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19870,31 +14903,15 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_monotonic_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 
%rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19908,9 +14925,9 @@ define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19924,9 +14941,9 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ 
-19934,31 +14951,15 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19972,9 +14973,9 @@ define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19988,9 +14989,9 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -19998,31 +14999,15 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 
%new) { ; SM70-LABEL: monotonic_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20036,9 +15021,9 @@ define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20052,9 +15037,9 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: 
ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20062,31 +15047,15 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20100,9 +15069,9 @@ define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; 
SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20116,9 +15085,9 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20126,33 +15095,16 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, 
[monotonic_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20166,10 +15118,10 @@ define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20183,10 +15135,10 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; 
SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20194,33 +15146,16 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; 
SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20234,10 +15169,10 @@ define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20251,10 +15186,10 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20262,33 +15197,16 @@ define 
i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: monotonic_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20302,10 +15220,10 @@ define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; 
SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20319,10 +15237,10 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20330,31 +15248,15 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret 
i64 %new -} - define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20368,9 +15270,9 @@ define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20384,9 +15286,9 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20394,31 +15296,15 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20432,9 +15318,9 @@ 
define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20448,9 +15334,9 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20458,31 +15344,15 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_monotonic_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20496,9 +15366,9 @@ define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20512,9 +15382,9 @@ define i64 
@acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20522,31 +15392,15 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20560,9 +15414,9 @@ define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20576,9 +15430,9 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20586,31 +15440,15 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20624,9 +15462,9 @@ define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 
%rd3, [acquire_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20640,9 +15478,9 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20650,31 +15488,15 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acquire_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20688,9 +15510,9 @@ define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20704,9 +15526,9 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20714,33 +15536,16 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20754,10 +15559,10 @@ define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20771,10 +15576,10 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20782,33 +15587,16 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20822,10 +15610,10 @@ define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20839,10 +15627,10 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20850,33 +15638,16 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acquire_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[acquire_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20890,10 +15661,10 @@ define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20907,10 +15678,10 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20918,31 +15689,15 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: 
release_monotonic_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20956,9 +15711,9 @@ define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; 
; SM70-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20972,9 +15727,9 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -20982,31 +15737,15 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; 
SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21020,9 +15759,9 @@ define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21036,29 +15775,13 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; 
; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic ret i64 %new } @@ -21068,9 +15791,9 @@ define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21084,9 +15807,9 @@ define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: 
ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21100,9 +15823,9 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21110,31 +15833,15 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 
%new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21148,9 +15855,9 @@ define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21164,9 +15871,9 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21174,31 +15881,15 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21212,9 +15903,9 @@ define i64 @release_acquire_i64_global_cta(ptr 
addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21228,9 +15919,9 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21238,31 +15929,15 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; 
SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21276,9 +15951,9 @@ define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21292,9 +15967,9 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 
%rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21302,33 +15977,16 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[release_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21342,10 +16000,10 @@ define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21359,10 +16017,10 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21370,33 +16028,16 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global( -; SM70: { 
-; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21410,10 +16051,10 @@ define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[release_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21427,10 +16068,10 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21438,33 +16079,16 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: release_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21478,10 +16102,10 @@ define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21495,10 +16119,10 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: 
ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21506,31 +16130,15 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21544,9 +16152,9 @@ define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21560,9 +16168,9 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21570,31 +16178,15 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 
[func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21608,9 +16200,9 @@ define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21624,9 +16216,9 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: 
ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21634,31 +16226,15 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: 
atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21672,9 +16248,9 @@ define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21688,9 +16264,9 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21698,31 +16274,15 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21736,9 +16296,9 @@ define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21752,9 +16312,9 @@ 
define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21762,31 +16322,15 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, 
[acq_rel_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21800,9 +16344,9 @@ define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21816,9 +16360,9 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21826,31 +16370,15 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 
@acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21864,9 +16392,9 @@ define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; 
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21880,9 +16408,9 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21890,33 +16418,16 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21930,10 +16441,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21947,10 +16458,10 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[acq_rel_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21958,33 +16469,16 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -21998,10 +16492,10 @@ define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22015,10 +16509,10 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22026,33 +16520,16 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM70-NEXT: 
atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22066,10 +16543,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22083,10 +16560,10 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22094,33 +16571,16 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: 
ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22134,10 +16594,10 @@ define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22151,10 +16611,10 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22162,33 +16622,16 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 
@seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_monotonic_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22202,10 +16645,10 @@ define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 
%rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22219,10 +16662,10 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22230,33 +16673,16 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 
@seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22270,10 +16696,10 @@ define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22287,10 +16713,10 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; 
SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22298,33 +16724,16 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_acquire_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22338,10 +16747,10 @@ define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22355,10 +16764,10 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22366,33 +16775,16 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[seq_cst_acquire_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_acquire_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22406,10 +16798,10 @@ define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22423,10 +16815,10 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22434,33 +16826,16 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_acquire_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, 
[seq_cst_acquire_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22474,10 +16849,10 @@ define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22491,10 +16866,10 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22502,33 +16877,16 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22542,10 +16900,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22559,10 +16917,10 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22570,33 +16928,16 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_global( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22610,10 +16951,10 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22627,10 +16968,10 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22638,33 +16979,16 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, 
[seq_cst_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22678,10 +17002,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -22695,10 +17019,10 @@ define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 33366ae25379b..f2ceda7951d45 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -2,8 +2,8 
@@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, 
i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 
@monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cluster(ptr 
%addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -178,12 +178,12 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -203,9 +203,9 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -223,12 +223,12 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
monotonic_monotonic_i8_global( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -248,9 +248,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -268,12 +268,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_sys( 
+define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -313,12 +313,12 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
monotonic_monotonic_i8_global_cta( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -358,12 +358,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: monotonic_monotonic_i8_global_cluster( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -383,9 +383,9 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -403,12 +403,12 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, 
i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -416,8 +416,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -448,12 +448,12 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 
%new) { -; SM90-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -461,8 +461,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -473,9 +473,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -493,12 +493,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -506,8 +506,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -518,9 +518,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -538,12 +538,12 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -551,8 +551,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -563,9 +563,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -581,14 +581,15 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 
@monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -596,8 +597,8 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -608,9 +609,9 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -626,14 +627,15 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr 
%addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -641,8 +643,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -653,9 +655,9 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -671,14 +673,15 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) 
%addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -686,8 +689,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -698,9 +701,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -716,15 +719,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -732,8 +735,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -744,9 +747,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -765,12 +768,12 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 
%cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -778,8 +781,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -790,9 +793,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -811,12 +814,12 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 
@monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_cluster( +define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -824,8 +827,8 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -836,9 +839,9 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -857,12 +860,12 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 
@monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -870,8 +873,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -882,9 +885,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -903,12 +906,12 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: monotonic_acquire_i8_global( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -916,8 +919,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -928,9 +931,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -949,12 +952,12 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_sys( +define i8 
@monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -962,8 +965,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -974,9 +977,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -992,15 +995,15 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 
@monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_cta( +define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1008,8 +1011,8 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1020,9 +1023,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1038,15 +1041,15 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") 
monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_cluster( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1054,8 +1057,8 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1066,9 +1069,9 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1084,15 +1087,15 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; 
SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_gpu( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1100,8 +1103,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1112,9 +1116,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1130,15 +1134,15 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 
bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1146,8 +1150,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1158,9 +1163,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1176,15 +1181,15 @@ define i8 
@monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1192,8 +1197,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1204,9 +1210,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: 
$L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1222,15 +1228,15 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1238,8 +1244,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1250,9 +1257,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1268,15 +1275,15 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_cluster( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1284,8 +1291,9 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1296,9 +1304,9 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[monotonic_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1314,15 +1322,15 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1330,8 +1338,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1342,9 +1351,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, 
i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1360,15 +1369,15 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic( +define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1376,9 +1385,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1389,9 +1398,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1407,15 +1416,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB30_1; ; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1423,9 +1432,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1436,9 +1445,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1454,15 +1463,15 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB31_1; ; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1470,9 +1479,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: 
fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1483,9 +1492,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1501,15 +1510,15 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB32_1; ; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1517,9 +1526,9 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1530,9 +1539,9 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1548,15 +1557,15 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB33_1; ; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1564,9 +1573,9 @@ define i8 
@monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1577,9 +1586,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1595,15 +1604,15 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB34_1; ; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { 
+; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1611,9 +1620,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1624,9 +1633,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1642,15 +1651,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB35_1; ; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 
%new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_sys( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1658,9 +1667,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1671,9 +1679,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1692,12 +1700,12 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
monotonic_seq_cst_i8_global_cta( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1705,9 +1713,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1718,9 +1725,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1739,12 +1746,12 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
monotonic_seq_cst_i8_global_cluster( +define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1752,9 +1759,8 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1765,9 +1771,9 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1786,12 +1792,12 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM90-LABEL: monotonic_seq_cst_i8_global_gpu( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1799,9 +1805,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1812,9 +1817,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1833,12 +1838,12 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
monotonic_seq_cst_i8_shared( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1846,9 +1851,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1859,9 +1863,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1880,12 +1884,12 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( +define i8 
@acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1893,9 +1897,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1906,9 +1909,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1924,15 +1927,15 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB41_1; ; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define 
i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( +define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1940,9 +1943,8 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1953,9 +1955,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1971,15 +1973,15 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB42_1; ; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 
%cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1987,9 +1989,8 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2000,9 +2001,9 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2018,15 +2019,15 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB43_1; ; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2034,9 +2035,8 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2047,9 +2047,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2065,15 +2065,15 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) 
%addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB44_1; ; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2081,8 +2081,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2093,9 +2093,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2111,15 +2111,15 @@ 
define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB45_1; ; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_sys( +define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2127,8 +2127,8 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2139,9 +2139,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2157,15 +2157,15 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB46_1; ; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_cta( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2173,8 +2173,8 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2185,9 +2185,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 
%r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2203,15 +2203,15 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB47_1; ; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_cluster( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2219,8 +2219,8 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2231,9 +2231,9 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: 
ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2249,15 +2249,15 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB48_1; ; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_gpu( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2265,8 +2265,8 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2277,9 +2277,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2295,15 +2295,15 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB49_1; ; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global( +define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2311,8 +2311,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2323,9 +2323,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, 
[acquire_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2341,15 +2341,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB50_1; ; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_sys( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2357,8 +2357,8 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2369,9 +2369,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[acquire_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2387,15 +2387,15 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB51_1; ; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_cta( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2403,8 +2403,8 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2415,9 +2415,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, 
%r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2433,15 +2433,15 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB52_1; ; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_cluster( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2449,8 +2449,8 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2461,9 +2461,9 @@ define i8 
@acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2479,15 +2479,15 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB53_1; ; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_gpu( +define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2495,8 +2495,8 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2507,9 +2507,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2525,15 +2525,15 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB54_1; ; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2541,8 +2541,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2553,9 +2553,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2571,15 +2571,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB55_1; ; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_sys( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2587,8 +2587,8 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 
%rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2599,9 +2599,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2620,12 +2620,12 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_cta( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2633,8 +2633,8 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; 
SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2645,9 +2645,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2666,12 +2666,12 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_cluster( +define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2679,8 +2679,8 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u8 
%rs1, [acquire_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2691,9 +2691,9 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2712,12 +2712,12 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_gpu( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2725,8 +2725,8 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2737,9 +2737,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2758,12 +2758,12 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2771,8 +2771,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2783,9 +2784,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2804,12 +2805,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_sys( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2817,8 +2818,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2829,9 +2831,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2847,15 +2849,15 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB61_1; ; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_cta( +define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2863,8 +2865,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2875,9 +2878,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2893,15 +2896,15 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB62_1; ; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_cluster( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2909,8 +2912,9 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2921,9 +2925,9 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2939,15 +2943,15 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB63_1; ; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -2955,8 +2959,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2967,9 +2972,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2985,15 +2990,15 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB64_1; ; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3001,8 +3006,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3013,9 +3019,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3031,15 +3037,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_sys( +define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3047,8 +3053,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3059,9 +3066,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3077,15 +3084,15 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_cta( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3093,8 +3100,9 @@ define i8 
@acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3105,9 +3113,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3123,15 +3131,15 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB67_1; ; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_cluster( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
acquire_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3139,8 +3147,9 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3151,9 +3160,9 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3169,15 +3178,15 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB68_1; ; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
acquire_acquire_i8_global_gpu( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3185,8 +3194,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3197,9 +3207,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3215,15 +3225,15 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB69_1; ; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire 
seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared( +define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3231,8 +3241,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3243,9 +3254,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3261,15 +3272,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB70_1; ; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_sys( +define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3277,8 +3288,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3289,9 +3301,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3307,15 +3319,15 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB71_1; ; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_cta( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3323,8 +3335,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3335,9 +3348,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3353,15 +3366,14 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB72_1; ; SM90-NEXT: 
$L__BB72_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_cluster( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3369,8 +3381,9 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3381,9 +3394,9 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3399,15 +3412,14 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, 
i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB73_1; ; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_gpu( +define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3415,8 +3427,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3427,9 +3440,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop 
Header: Depth=1 @@ -3445,15 +3458,14 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB74_1; ; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic( +define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3461,9 +3473,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3474,9 +3486,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB75_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3492,15 +3504,14 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB75_1; ; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_sys( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3508,9 +3519,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3521,9 +3532,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 
%r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3539,15 +3550,14 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB76_1; ; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_cta( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3555,9 +3565,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3568,9 +3578,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, 
[%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3586,15 +3596,14 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB77_1; ; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( +define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3602,9 +3611,9 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3615,9 +3624,9 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; +; 
SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3633,15 +3642,14 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB78_1; ; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_gpu( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3649,9 +3657,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3662,9 +3670,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3680,15 +3688,14 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB79_1; ; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3696,9 +3703,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3709,9 +3716,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) 
{ ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3727,15 +3734,14 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB80_1; ; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_sys( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3743,9 +3749,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 
3; @@ -3756,9 +3762,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3774,15 +3780,14 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB81_1; ; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_cta( +define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3790,9 +3795,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3803,9 +3808,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3821,15 +3826,14 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB82_1; ; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_cluster( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3837,9 +3841,9 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3850,9 +3854,9 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3868,15 +3872,14 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB83_1; ; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_gpu( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3884,9 +3887,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3897,9 +3900,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3915,15 +3918,15 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB84_1; ; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3931,9 +3934,9 @@ 
define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3944,9 +3947,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3962,15 +3965,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB85_1; ; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_sys( +define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
release_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -3978,9 +3981,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -3991,9 +3994,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4009,15 +4012,15 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB86_1; ; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, 
i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_cta( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4025,9 +4028,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4038,9 +4041,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4056,15 +4059,15 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB87_1; ; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("device") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4072,9 +4075,9 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4085,9 +4088,9 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4103,15 +4106,15 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB88_1; ; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], 
%r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4119,9 +4122,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4132,9 +4135,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4150,15 +4153,15 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB89_1; ; 
SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic( +define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4166,9 +4169,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4179,32 +4182,33 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, 
%r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB90_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB90_1; ; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_sys( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4212,9 +4216,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4225,32 +4229,33 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, 
%r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB91_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB91_1; ; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_cta( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4258,9 +4263,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; 
-; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4271,32 +4276,33 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB92_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB92_1; ; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_cluster( +define i8 
@release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4304,9 +4310,9 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4317,32 +4323,33 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB93_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; 
; SM90-NEXT: @%p2 bra $L__BB93_1; ; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_gpu( +define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4350,9 +4357,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4363,32 +4370,33 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: 
or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB94_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB94_1; ; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4396,9 +4404,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4409,32 +4417,33 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB95_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB95_1; ; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_sys( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4442,9 +4451,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: 
ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4455,32 +4464,33 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB96_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB96_1; ; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
release_monotonic_i8_global_cta( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4488,9 +4498,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4501,32 +4511,33 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB97_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 
%r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB97_1; ; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_cluster( +define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4534,9 +4545,9 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4547,32 +4558,33 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB98_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB98_1; ; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_gpu( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4580,9 +4592,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4593,32 +4605,33 @@ define i8 @release_monotonic_i8_global_gpu(ptr 
addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB99_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB99_1; ; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4626,9 +4639,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[release_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4639,32 +4652,33 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB100_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB100_1; ; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 
@release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_sys( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4672,9 +4686,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4685,32 +4699,33 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB101_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM90-NEXT: 
and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB101_1; ; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_cta( +define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4718,9 +4733,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4731,32 +4746,33 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, 
[%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB102_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB102_1; ; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_cluster( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4764,9 +4780,9 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4777,32 +4793,33 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB103_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB103_1; ; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_gpu( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ 
-4810,9 +4827,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4823,32 +4840,33 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB104_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB104_1; ; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4856,9 +4874,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4869,33 +4887,33 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra 
$L__BB105_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB105_1; ; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_sys( +define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4903,9 +4921,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4916,33 +4934,33 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB106_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB106_1; ; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_cta( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4950,9 +4968,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -4963,33 +4981,33 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB107_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB107_1; ; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_cluster( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM90: { ; 
SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -4997,9 +5015,9 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5010,33 +5028,33 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB108_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB108_1; ; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: 
fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5044,9 +5062,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5057,33 +5075,33 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB109_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB109_1; ; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global( +define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5091,9 +5109,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5104,33 +5122,33 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[release_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB110_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB110_1; ; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_sys( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5138,9 +5156,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: 
ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5151,33 +5169,33 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB111_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB111_1; ; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_cta( +define i8 
@acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5185,9 +5203,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5198,33 +5216,33 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB112_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, 
%r8; ; SM90-NEXT: @%p2 bra $L__BB112_1; ; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_cluster( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5232,9 +5250,9 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5245,33 +5263,33 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB113_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB113_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB113_1; ; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_gpu( +define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5279,9 +5297,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5292,33 +5310,33 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB114_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB114_1; ; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ 
-5326,9 +5344,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5339,33 +5357,33 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB115_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB115_1; ; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_sys( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5373,8 +5391,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5386,9 +5404,9 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5401,18 +5419,18 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 
%r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB116_1; ; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5420,8 +5438,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5433,9 +5451,9 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5448,18 +5466,18 @@ define i8 
@release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB117_1; ; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_cluster( +define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5467,8 +5485,8 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5480,9 +5498,9 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 
%r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5495,18 +5513,18 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB118_1; ; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new } -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_gpu( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5514,8 +5532,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5527,9 +5545,9 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5542,18 +5560,18 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB119_1; ; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5561,9 +5579,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5574,9 +5592,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5589,18 +5607,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB120_1; ; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_sys( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5608,9 +5626,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: 
ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5621,43 +5639,43 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB121_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB121_1; ; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_cta( -; SM90: { +define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 
%new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( +; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; ; SM90-NEXT: .reg .b32 %r<21>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5668,33 +5686,33 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB122_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB122_1; ; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; 
; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_cluster( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5702,9 +5720,9 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5715,33 +5733,33 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: 
atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB123_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB123_1; ; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_gpu( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5749,9 +5767,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5762,33 +5780,33 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[release_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB124_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB124_1; ; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5796,9 +5814,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 
%rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5809,33 +5827,33 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB125_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB125_1; ; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_sys( +define i8 
@acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5843,9 +5861,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5856,33 +5874,33 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB126_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: 
mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB126_1; ; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_cta( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5890,9 +5908,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5903,33 +5921,33 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB127_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB127_1; ; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_cluster( +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5937,9 +5955,9 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; 
@@ -5950,33 +5968,33 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB128_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB128_1; ; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_gpu( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -5984,9 +6002,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -5997,33 +6015,33 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB129_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB129_1; ; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, 
i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared( +define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6031,9 +6049,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6044,33 +6062,33 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 
%p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB130_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB130_1; ; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_sys( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6078,9 +6096,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6091,33 +6109,33 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, 
[acq_rel_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB131_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB131_1; ; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6125,9 +6143,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, 
[acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6138,33 +6156,33 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB132_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB132_1; ; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_cluster( +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr 
%addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6172,9 +6190,9 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6185,33 +6203,33 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB133_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB133_1; ; SM90-NEXT: 
$L__BB133_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6219,9 +6237,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6232,33 +6250,33 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, 
%r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB134_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB134_1; ; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic( +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6266,9 +6284,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6279,33 +6297,33 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB135_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB135_1; ; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_sys( +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6313,9 +6331,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6326,33 +6344,33 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB136_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB136_1; ; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_cta( 
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6360,9 +6378,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6373,33 +6391,33 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB137_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 
bra $L__BB137_1; ; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( +define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6407,9 +6425,9 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6420,33 +6438,33 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB138_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB138_1; ; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6454,9 +6472,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6467,33 +6485,33 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB139_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB139_1; ; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global( +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6501,9 +6519,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; 
SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6514,33 +6532,33 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB140_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB140_1; ; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { 
-; SM90-LABEL: acq_rel_monotonic_i8_global_sys( +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6548,9 +6566,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6561,33 +6579,33 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB141_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: 
mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB141_1; ; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_cta( +define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6595,9 +6613,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6608,33 +6626,33 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, 
%r2; ; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB142_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB142_1; ; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6642,9 +6660,9 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6655,33 +6673,33 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB143_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB143_1; ; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: 
.reg .b16 %rs<2>; @@ -6689,9 +6707,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6702,33 +6720,33 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB144_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB144_1; ; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 
[func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared( +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6736,9 +6754,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6749,33 +6767,33 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 
%r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB145_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB145_1; ; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( +define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6783,9 +6801,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6796,33 +6814,33 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[acq_rel_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB146_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB146_1; ; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6830,9 +6848,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: 
fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6843,33 +6861,33 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB147_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB147_1; ; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
acq_rel_monotonic_i8_shared_cluster( +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6877,9 +6895,9 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6890,33 +6908,33 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB148_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, 
%r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB148_1; ; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6924,9 +6942,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6937,33 +6955,33 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 
%r20, %r16, %r2; ; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB149_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB149_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB149_1; ; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic( +define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -6971,9 +6989,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 
%r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -6984,33 +7002,33 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB150_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB150_1; ; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_sys( +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7018,9 +7036,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 
%cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7031,33 +7049,33 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB151_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB151_1; ; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + 
%pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_cta( +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7065,9 +7083,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7078,33 +7096,33 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB152_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB152_1; ; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7112,9 +7130,9 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7125,33 +7143,33 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 
%r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB153_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB153_1; ; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( +define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7159,9 +7177,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7172,33 +7190,33 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB154_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB154_1; ; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global( +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
seq_cst_monotonic_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7206,9 +7224,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7219,33 +7237,33 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB155_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB155_1; ; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new } -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_sys( +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7253,9 +7271,9 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7266,33 +7284,33 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB156_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB156_1; ; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_cta( +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7300,9 +7318,9 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7313,33 +7331,33 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB157_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB157_1; ; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_cluster( +define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7347,9 +7365,9 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: 
fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7360,33 +7378,33 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB158_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB158_1; ; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
acq_rel_acquire_i8_global_gpu( +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7394,9 +7412,9 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7407,33 +7425,33 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB159_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 
%r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB159_1; ; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared( +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7441,9 +7459,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7454,33 +7472,33 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, 
%r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB160_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB160_1; ; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_sys( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7488,9 +7506,9 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7501,33 +7519,33 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: 
and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB161_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB161_1; ; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_cta( +define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7535,9 +7553,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[acq_rel_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7548,33 +7566,33 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB162_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB162_1; ; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("cluster") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7582,9 +7600,9 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7595,33 +7613,33 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB163_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB163_1; ; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7629,9 +7647,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7642,33 +7660,33 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB164_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB164_1; ; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic( +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7676,9 +7694,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; 
SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7689,33 +7707,33 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB165_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB165_1; ; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( +define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ 
-7723,9 +7741,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7736,33 +7754,33 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB166_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB166_1; ; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; 
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7770,9 +7788,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7783,33 +7801,33 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; 
; SM90-NEXT: @%p1 bra $L__BB167_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB167_1; ; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7817,9 +7835,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7830,33 +7848,33 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, 
[seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB168_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB168_1; ; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7864,9 +7882,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7877,33 +7895,33 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB169_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB169_1; ; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global( +define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; 
SM90-NEXT: .reg .b16 %rs<2>; @@ -7911,9 +7929,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7924,33 +7942,33 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB170_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB170_1; ; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 
[func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -7958,9 +7976,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -7971,33 +7989,33 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, 
%r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB171_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB171_1; ; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8005,9 +8023,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8018,33 +8036,33 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; 
SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB172_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB172_1; ; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8052,9 +8070,9 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; +; 
SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8065,33 +8083,33 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB173_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB173_1; ; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
acq_rel_seq_cst_i8_global_gpu( +define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8099,9 +8117,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8112,33 +8130,33 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB174_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: 
mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB174_1; ; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared( +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8146,9 +8164,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -8159,33 +8177,33 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB175_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB175_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB175_1; ; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( +define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8193,8 +8211,8 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8206,9 +8224,9 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) 
%addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8221,18 +8239,18 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB176_1; ; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8240,8 +8258,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8253,9 +8271,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8268,18 +8286,18 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB177_1; ; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( +define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8287,8 +8305,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 
%rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8300,9 +8318,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8315,18 +8333,18 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB178_1; ; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( ; SM90: { ; 
SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -8334,8 +8352,8 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8347,9 +8365,9 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8362,2276 +8380,117 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: mov.u32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB179_1; ; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r13; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 
%cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic( +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_sys( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 
%p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB180_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB180_1; ; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new } -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; 
SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 65535; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB181_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: mov.u32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB181_1; ; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 
@seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( +define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB182_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB182_1; -; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB183_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB183_1; -; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; 
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB184_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB184_1; -; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 
@seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB185_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB185_1; -; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_sys( -; SM90: 
{ -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB186_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB186_1; -; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; 
SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB187_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB187_1; -; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB188_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB188_1; -; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; 
SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB189_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB189_1; -; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB190_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB190_1; -; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; 
SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB191_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB191_1; -; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, 
%r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB192_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB192_1; -; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; 
-; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB193_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB193_1; -; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB194_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB194_1; -; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 
%r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB195_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB195_1; -; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB196_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB196_1; -; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB197_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB197_1; -; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB198_3; -; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB198_1; -; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB199_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB199_1; -; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB200_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB200_1; -; SM90-NEXT: $L__BB200_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB201_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB201_1; -; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB202_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB202_1; -; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} 
- -define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB203_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB203_1; -; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 
%cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB204_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB204_1; -; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; 
SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB205_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB205_1; -; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB206_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB206_1; -; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB207_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB207_1; -; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB208_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB208_1; -; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 
3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB209_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB209_1; -; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: 
not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB210_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB210_1; -; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB211_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB211_1; -; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, 
%r16, %r2; -; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB212_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB212_1; -; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB213_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB213_1; -; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, 
%r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB214_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB214_1; -; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB215_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB215_1; -; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB216_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; 
-; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB216_1; -; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB217_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB217_1; -; SM90-NEXT: $L__BB217_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB218_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB218_1; -; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB219_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB219_1; -; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 
%cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB220_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB220_1; -; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 
%new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB221_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB221_1; -; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: 
.reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB222_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB222_1; -; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB223_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB223_1; -; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB224_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB224_1; -; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB225_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB225_1; -; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, 
%r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB226_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB226_1; -; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB227_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB227_1; -; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10641,74 +8500,30 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; 
SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB228_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB228_1; -; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This 
Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB229_3; +; SM90-NEXT: @%p1 bra $L__BB182_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB229_1; -; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB182_1; +; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic ret i16 %new } -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global( +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -10716,10 +8531,10 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %r9, 
[monotonic_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10729,25 +8544,25 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB230_3; +; SM90-NEXT: @%p1 bra $L__BB183_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB230_1; -; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB183_1; +; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } @@ -10760,10 +8575,10 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10773,22 +8588,22 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB231_3; +; SM90-NEXT: @%p1 bra $L__BB184_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB231_1; -; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB184_1; +; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -10804,10 +8619,10 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10817,22 +8632,22 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB232_3; +; SM90-NEXT: @%p1 bra $L__BB185_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB232_1; -; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB185_1; +; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -10848,10 +8663,10 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10861,22 +8676,22 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB233_3; +; SM90-NEXT: @%p1 bra $L__BB186_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB233_1; -; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end +; 
SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB186_1; +; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic @@ -10892,10 +8707,10 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10905,72 +8720,28 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB234_3; +; SM90-NEXT: @%p1 bra $L__BB187_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM90-NEXT: and.b32 
%r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB234_1; -; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB187_1; +; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB235_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: 
mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB235_1; -; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_monotonic_i16_shared_sys( ; SM90: { @@ -10980,10 +8751,10 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10993,22 +8764,22 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB236_3; +; SM90-NEXT: @%p1 bra $L__BB188_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB236_1; -; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB188_1; +; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic @@ -11024,10 +8795,10 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11037,22 +8808,22 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB237_3; +; SM90-NEXT: @%p1 bra $L__BB189_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB237_1; -; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB189_1; +; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic @@ -11068,10 +8839,10 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11081,22 +8852,22 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB238_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB238_3; +; SM90-NEXT: @%p1 bra $L__BB190_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB238_1; -; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB190_1; +; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic @@ -11112,10 +8883,10 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11125,73 +8896,28 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB239_3; +; SM90-NEXT: @%p1 bra $L__BB191_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB239_1; -; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB191_1; +; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic ret i16 %new } -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; 
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB240_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB240_1; -; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_acquire_i16_generic_sys( ; SM90: { @@ -11201,10 +8927,10 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11214,22 +8940,22 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr 
%addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB241_3; +; SM90-NEXT: @%p1 bra $L__BB192_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB241_1; -; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB192_1; +; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11246,10 +8972,10 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 
3; @@ -11259,22 +8985,22 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB242_3; +; SM90-NEXT: @%p1 bra $L__BB193_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB242_1; -; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB193_1; +; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11291,10 +9017,10 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; 
; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11304,22 +9030,22 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB243_3; +; SM90-NEXT: @%p1 bra $L__BB194_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB243_1; -; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB194_1; +; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11328,52 +9054,7 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) } define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB244_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB244_1; -; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global( +; SM90-LABEL: monotonic_acquire_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11381,10 +9062,10 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; 
SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11394,26 +9075,26 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB245_3; +; SM90-NEXT: @%p1 bra $L__BB195_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB245_1; -; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB195_1; +; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire ret i16 %new } @@ -11426,10 +9107,10 @@ define 
i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11439,22 +9120,22 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB246_3; +; SM90-NEXT: @%p1 bra $L__BB196_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB246_1; -; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB196_1; +; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; 
; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11471,10 +9152,10 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11484,22 +9165,22 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB247_3; +; SM90-NEXT: @%p1 bra $L__BB197_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB247_1; -; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra 
$L__BB197_1; +; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11516,10 +9197,10 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11529,22 +9210,22 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB248_3; +; SM90-NEXT: @%p1 bra $L__BB198_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 
bra $L__BB248_1; -; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB198_1; +; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11561,10 +9242,10 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11574,22 +9255,22 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB249_3; +; SM90-NEXT: @%p1 bra $L__BB199_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB249_1; -; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB199_1; +; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11597,51 +9278,6 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB250_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; 
-; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB250_1; -; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_acquire_i16_shared_sys( ; SM90: { @@ -11651,10 +9287,10 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11664,22 +9300,22 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB251_3; +; SM90-NEXT: @%p1 bra 
$L__BB200_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB251_1; -; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB200_1; +; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11696,10 +9332,10 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11709,22 +9345,22 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB252_3; +; SM90-NEXT: @%p1 bra $L__BB201_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB252_1; -; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB201_1; +; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11741,10 +9377,10 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11754,22 +9390,22 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 
%r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB253_3; +; SM90-NEXT: @%p1 bra $L__BB202_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB253_1; -; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB202_1; +; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11786,10 +9422,10 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11799,22 +9435,22 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM90-NEXT: 
$L__BB203_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB254_3; +; SM90-NEXT: @%p1 bra $L__BB203_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB254_1; -; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB203_1; +; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11822,52 +9458,6 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: 
and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB255_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB255_1; -; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: monotonic_seq_cst_i16_generic_sys( ; SM90: { @@ -11877,10 +9467,10 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11891,22 +9481,22 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, 
[%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB256_3; +; SM90-NEXT: @%p1 bra $L__BB204_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB256_1; -; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB204_1; +; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11923,10 +9513,10 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11937,22 +9527,22 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, 
%r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB257_3; +; SM90-NEXT: @%p1 bra $L__BB205_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB257_1; -; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB205_1; +; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -11969,10 +9559,10 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11983,22 +9573,22 @@ define i16 
@monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB258_3; +; SM90-NEXT: @%p1 bra $L__BB206_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB258_1; -; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB206_1; +; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12015,56 +9605,10 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB259_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB259_1; -; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12075,26 +9619,26 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, 
i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB260_3; +; SM90-NEXT: @%p1 bra $L__BB207_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB260_1; -; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB207_1; +; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst ret i16 %new } @@ -12107,10 +9651,10 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; ; 
SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12121,22 +9665,22 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB261_3; +; SM90-NEXT: @%p1 bra $L__BB208_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB261_1; -; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB208_1; +; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12153,10 +9697,10 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, 
[monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12167,22 +9711,22 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB262_3; +; SM90-NEXT: @%p1 bra $L__BB209_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB262_1; -; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB209_1; +; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12199,10 +9743,10 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: 
ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12213,22 +9757,22 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB263_3; +; SM90-NEXT: @%p1 bra $L__BB210_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB263_1; -; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB210_1; +; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12245,10 +9789,10 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12259,22 +9803,22 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB264_3; +; SM90-NEXT: @%p1 bra $L__BB211_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB264_1; -; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB211_1; +; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12282,52 +9826,6 @@ define i16 
@monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB265_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB265_1; -; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 
%cmp, i16 %new) { ; SM90-LABEL: monotonic_seq_cst_i16_shared_sys( ; SM90: { @@ -12337,10 +9835,10 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12351,22 +9849,22 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB266_3; +; SM90-NEXT: @%p1 bra $L__BB212_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB266_1; -; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 
bra $L__BB212_1; +; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12383,10 +9881,10 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12397,22 +9895,22 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB267_3; +; SM90-NEXT: @%p1 bra $L__BB213_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB267_1; -; SM90-NEXT: 
$L__BB267_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB213_1; +; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12429,10 +9927,10 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12443,22 +9941,22 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB268_3; +; SM90-NEXT: @%p1 bra $L__BB214_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; 
SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB268_1; -; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB214_1; +; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12475,10 +9973,10 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12489,22 +9987,22 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB269_3; +; SM90-NEXT: @%p1 bra $L__BB215_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB269_1 
Depth=1 +; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB269_1; -; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB215_1; +; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12512,51 +10010,6 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB270_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 -; SM90-NEXT: 
and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB270_1; -; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_monotonic_i16_generic_sys( ; SM90: { @@ -12566,10 +10019,10 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12579,22 +10032,22 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB271_3; +; 
SM90-NEXT: @%p1 bra $L__BB216_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB271_1; -; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB216_1; +; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12611,10 +10064,10 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12624,22 +10077,22 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; 
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB272_3; +; SM90-NEXT: @%p1 bra $L__BB217_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB272_1; -; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB217_1; +; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12656,10 +10109,10 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12669,22 +10122,22 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, 
%r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB273_3; +; SM90-NEXT: @%p1 bra $L__BB218_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB273_1; -; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB218_1; +; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12692,53 +10145,8 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ret i16 %new } -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB274_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB274_1; -; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global( +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -12746,10 +10154,10 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12759,26 +10167,26 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; 
SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB275_3; +; SM90-NEXT: @%p1 bra $L__BB219_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB275_1; -; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB219_1; +; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } @@ -12791,10 +10199,10 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 
%r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12804,22 +10212,22 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB276_3; +; SM90-NEXT: @%p1 bra $L__BB220_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB276_1; -; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB220_1; +; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12836,10 +10244,10 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12849,22 +10257,22 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB277_3; +; SM90-NEXT: @%p1 bra $L__BB221_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB277_1; -; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB221_1; +; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12881,10 +10289,10 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; +; 
SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12894,22 +10302,22 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB278_3; +; SM90-NEXT: @%p1 bra $L__BB222_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB278_1; -; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB222_1; +; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12926,10 +10334,10 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[acquire_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -12939,22 +10347,22 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB279_3; +; SM90-NEXT: @%p1 bra $L__BB223_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB279_1; -; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB223_1; +; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -12962,51 +10370,6 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i 
ret i16 %new } -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB280_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB280_1; -; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_monotonic_i16_shared_sys( ; SM90: { @@ -13016,10 +10379,10 
@@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13029,22 +10392,22 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB281_3; +; SM90-NEXT: @%p1 bra $L__BB224_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB281_1; -; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB224_1; +; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13061,10 +10424,10 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13074,22 +10437,22 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB282_3; +; SM90-NEXT: @%p1 bra $L__BB225_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB282_1; -; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; 
+; SM90-NEXT: @%p2 bra $L__BB225_1; +; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13106,10 +10469,10 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13119,22 +10482,22 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB283_3; +; SM90-NEXT: @%p1 bra $L__BB226_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 
%r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB283_1; -; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB226_1; +; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13151,10 +10514,10 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13164,22 +10527,22 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB284_3; +; SM90-NEXT: @%p1 bra $L__BB227_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB227_1 
Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB284_1; -; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB227_1; +; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13187,51 +10550,6 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB285_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, 
%r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB285_1; -; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_acquire_i16_generic_sys( ; SM90: { @@ -13241,10 +10559,10 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13254,22 +10572,22 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB286_3; +; SM90-NEXT: @%p1 bra $L__BB228_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB286_1; -; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB228_1; +; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13286,10 +10604,10 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13299,22 +10617,22 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB287_3; +; SM90-NEXT: @%p1 bra 
$L__BB229_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB287_1; -; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB229_1; +; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13331,10 +10649,10 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13344,22 +10662,22 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; 
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB288_3; +; SM90-NEXT: @%p1 bra $L__BB230_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB288_1; -; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB230_1; +; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13367,53 +10685,8 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB289_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB289_1; -; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global( +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -13421,10 +10694,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13434,26 +10707,26 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; 
SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB290_3; +; SM90-NEXT: @%p1 bra $L__BB231_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB290_1; -; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB231_1; +; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } @@ -13466,10 +10739,10 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; 
SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13479,22 +10752,22 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB291_3; +; SM90-NEXT: @%p1 bra $L__BB232_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB291_1; -; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB232_1; +; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13511,10 +10784,10 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: 
ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13524,22 +10797,22 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB292_3; +; SM90-NEXT: @%p1 bra $L__BB233_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB292_1; -; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB233_1; +; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13556,10 +10829,10 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13569,22 +10842,22 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB293_3; +; SM90-NEXT: @%p1 bra $L__BB234_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB293_1; -; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB234_1; +; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13601,10 +10874,10 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, 
[acquire_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13614,22 +10887,22 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB294_3; +; SM90-NEXT: @%p1 bra $L__BB235_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB294_1; -; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB235_1; +; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13637,51 +10910,6 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared( -; SM90: { -; 
SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB295_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB295_1; -; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_acquire_i16_shared_sys( ; SM90: { @@ -13691,10 +10919,10 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13704,22 +10932,22 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB296_3; +; SM90-NEXT: @%p1 bra $L__BB236_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB296_1; -; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB236_1; +; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13736,10 +10964,10 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 
%cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13749,22 +10977,22 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB297_3; +; SM90-NEXT: @%p1 bra $L__BB237_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB297_1; -; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB237_1; +; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ 
-13781,10 +11009,10 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13794,22 +11022,22 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB298_3; +; SM90-NEXT: @%p1 bra $L__BB238_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB298_1; -; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB238_1; +; SM90-NEXT: $L__BB238_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13826,10 +11054,10 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -13839,22 +11067,22 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB299_3; +; SM90-NEXT: @%p1 bra $L__BB239_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB299_1; -; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; 
SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB239_1; +; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13862,52 +11090,6 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB300_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB300_1; -; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_seq_cst_i16_generic_sys( ; SM90: { @@ -13917,10 +11099,10 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13931,22 +11113,22 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB301_3; +; SM90-NEXT: @%p1 bra $L__BB240_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB301_1; -; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB240_1; +; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -13963,10 +11145,10 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13977,22 +11159,22 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB302_3; +; SM90-NEXT: @%p1 bra $L__BB241_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM90-NEXT: and.b32 
%r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB302_1; -; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB241_1; +; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14009,10 +11191,10 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14023,22 +11205,22 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB303_3; +; SM90-NEXT: @%p1 bra $L__BB242_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in 
Loop: Header=BB303_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB303_1; -; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB242_1; +; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14055,56 +11237,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB304_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB304_1; -; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14115,26 +11251,26 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB305_3; +; SM90-NEXT: @%p1 bra $L__BB243_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in 
Loop: Header=BB305_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB305_1; -; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB243_1; +; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } @@ -14147,10 +11283,10 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14161,22 +11297,22 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, 
%r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB306_3; +; SM90-NEXT: @%p1 bra $L__BB244_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB306_1; -; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB244_1; +; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14193,10 +11329,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14207,22 +11343,22 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB307_3; +; SM90-NEXT: @%p1 bra $L__BB245_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB307_1; -; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB245_1; +; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14239,10 +11375,10 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14253,22 +11389,22 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; 
-; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB308_3; +; SM90-NEXT: @%p1 bra $L__BB246_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB308_1; -; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB246_1; +; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14285,10 +11421,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14299,22 +11435,22 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, 
[%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB309_3; +; SM90-NEXT: @%p1 bra $L__BB247_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB309_1; -; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB247_1; +; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14322,52 +11458,6 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB310_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB310_1; -; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acquire_seq_cst_i16_shared_sys( ; SM90: { @@ -14377,10 +11467,10 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14391,22 +11481,22 @@ define i16 
@acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB311_3; +; SM90-NEXT: @%p1 bra $L__BB248_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB311_1; -; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB248_1; +; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14423,10 +11513,10 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, 
%rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14437,22 +11527,22 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB312_3; +; SM90-NEXT: @%p1 bra $L__BB249_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB312_1; -; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB249_1; +; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14469,10 +11559,10 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, 
[acquire_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14483,22 +11573,22 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB313_3; +; SM90-NEXT: @%p1 bra $L__BB250_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB313_1; -; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB250_1; +; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14515,10 +11605,10 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: 
ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14529,22 +11619,22 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB314_3; +; SM90-NEXT: @%p1 bra $L__BB251_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB314_1; -; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB251_1; +; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -14552,51 +11642,6 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB315_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB315_1; -; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_monotonic_i16_generic_sys( ; SM90: { @@ -14606,10 +11651,10 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, 
[release_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14620,22 +11665,22 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB316_3; +; SM90-NEXT: @%p1 bra $L__BB252_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB316_1; -; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB252_1; +; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -14651,10 +11696,10 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[release_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14665,22 +11710,22 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB317_3; +; SM90-NEXT: @%p1 bra $L__BB253_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB317_1; -; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB253_1; +; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -14696,10 +11741,10 @@ define i16 
@release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14710,22 +11755,22 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB318_3; +; SM90-NEXT: @%p1 bra $L__BB254_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB318_1; -; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB254_1; +; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end ; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic @@ -14733,52 +11778,7 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) } define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB319_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB319_1; -; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, 
i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global( +; SM90-LABEL: release_monotonic_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -14786,10 +11786,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14800,25 +11800,25 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB320_3; +; SM90-NEXT: @%p1 bra $L__BB255_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: 
// in Loop: Header=BB320_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB320_1; -; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB255_1; +; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } @@ -14831,10 +11831,10 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14845,22 +11845,22 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, 
%r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB321_3; +; SM90-NEXT: @%p1 bra $L__BB256_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB321_1; -; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB256_1; +; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -14876,10 +11876,10 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14890,22 +11890,22 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop +; SM90-NEXT: 
$L__BB257_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB322_3; +; SM90-NEXT: @%p1 bra $L__BB257_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB322_1; -; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB257_1; +; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -14921,10 +11921,10 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14935,22 +11935,22 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB323_3; +; SM90-NEXT: @%p1 bra $L__BB258_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB323_1; -; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB258_1; +; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic @@ -14966,10 +11966,10 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14980,73 +11980,28 @@ define i16 
@release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB324_3; +; SM90-NEXT: @%p1 bra $L__BB259_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB324_1; -; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB259_1; +; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, 
%rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB325_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB325_1; -; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_monotonic_i16_shared_sys( ; SM90: { @@ -15056,10 +12011,10 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15070,22 +12025,22 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB326_3; +; SM90-NEXT: @%p1 bra $L__BB260_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB326_1; -; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB260_1; +; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic @@ -15101,10 +12056,10 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: 
fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15115,22 +12070,22 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB327_3; +; SM90-NEXT: @%p1 bra $L__BB261_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB327_1; -; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB261_1; +; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic @@ -15146,10 +12101,10 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15160,22 +12115,22 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB328_3; +; SM90-NEXT: @%p1 bra $L__BB262_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB328_1; -; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB262_1; +; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic @@ -15191,10 +12146,10 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) 
%addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15205,74 +12160,28 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB329_3; +; SM90-NEXT: @%p1 bra $L__BB263_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB329_1; -; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB263_1; +; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB330_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB330_1; -; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: 
release_acquire_i16_generic_sys( ; SM90: { @@ -15282,10 +12191,10 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15296,22 +12205,22 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB331_3; +; SM90-NEXT: @%p1 bra $L__BB264_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB331_1; -; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB264_1; +; SM90-NEXT: $L__BB264_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15328,10 +12237,10 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15342,22 +12251,22 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB332_3; +; SM90-NEXT: @%p1 bra $L__BB265_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB332_1; -; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; 
SM90-NEXT: @%p2 bra $L__BB265_1; +; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15374,10 +12283,10 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15388,77 +12297,31 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB333_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB333_1; -; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB334_3; +; SM90-NEXT: @%p1 bra $L__BB266_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB334_1; -; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 
bra $L__BB266_1; +; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire ret i16 %new } -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global( +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -15466,10 +12329,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15480,26 +12343,26 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, 
%r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB335_3; +; SM90-NEXT: @%p1 bra $L__BB267_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB335_1; -; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB267_1; +; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } @@ -15512,10 +12375,10 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15526,22 +12389,22 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB336_3; +; SM90-NEXT: @%p1 bra $L__BB268_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB336_1; -; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB268_1; +; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15558,10 +12421,10 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15572,22 +12435,22 @@ define i16 @release_acquire_i16_global_cta(ptr 
addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB337_3; +; SM90-NEXT: @%p1 bra $L__BB269_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB337_1; -; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB269_1; +; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15604,10 +12467,10 @@ define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, 
%rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15618,22 +12481,22 @@ define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB338_3; +; SM90-NEXT: @%p1 bra $L__BB270_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB338_1; -; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB270_1; +; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15650,10 +12513,10 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, 
[release_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15664,22 +12527,22 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB339_3; +; SM90-NEXT: @%p1 bra $L__BB271_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB339_1; -; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB271_1; +; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15687,52 +12550,6 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 
%rd2, [release_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB340_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB340_1; -; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_acquire_i16_shared_sys( ; SM90: { @@ -15742,10 +12559,10 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15756,22 +12573,22 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB341_3; +; SM90-NEXT: @%p1 bra $L__BB272_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB341_1; -; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB272_1; +; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15788,10 +12605,10 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; 
SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15802,22 +12619,22 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB342_3; +; SM90-NEXT: @%p1 bra $L__BB273_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB342_1; -; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB273_1; +; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15834,10 +12651,10 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[release_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15848,22 +12665,22 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB343_3; +; SM90-NEXT: @%p1 bra $L__BB274_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB343_1; -; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB274_1; +; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -15880,10 +12697,10 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) 
%addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15894,22 +12711,22 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB344_3; +; SM90-NEXT: @%p1 bra $L__BB275_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB344_1; -; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB275_1; +; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; 
@@ -15917,52 +12734,6 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB345_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB345_1; -; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; 
SM90-LABEL: release_seq_cst_i16_generic_sys( ; SM90: { @@ -15972,10 +12743,10 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15986,22 +12757,22 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB346_3; +; SM90-NEXT: @%p1 bra $L__BB276_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB346_1; -; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB276_1; +; SM90-NEXT: $L__BB276_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16018,10 +12789,10 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16032,77 +12803,31 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB347_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB347_1; -; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg 
ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB348_3; +; SM90-NEXT: @%p1 bra $L__BB277_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB348_1; -; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB277_1; +; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM90-NEXT: 
fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic_gpu( +define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -16110,10 +12835,10 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16124,31 +12849,31 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: 
atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB349_3; +; SM90-NEXT: @%p1 bra $L__BB278_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB349_1; -; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB278_1; +; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst ret i16 %new } -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_global( +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -16156,10 +12881,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, 
[release_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16170,26 +12895,26 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB350_3; +; SM90-NEXT: @%p1 bra $L__BB279_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB350_1; -; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB279_1; +; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } @@ -16202,10 +12927,10 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; 
SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16216,22 +12941,22 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB351_3; +; SM90-NEXT: @%p1 bra $L__BB280_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB351_1; -; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB280_1; +; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16248,10 +12973,10 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16262,22 +12987,22 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB352_3; +; SM90-NEXT: @%p1 bra $L__BB281_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB352_1; -; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB281_1; +; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16294,10 +13019,10 @@ define i16 @release_seq_cst_i16_global_cluster(ptr 
addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16308,22 +13033,22 @@ define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB353_3; +; SM90-NEXT: @%p1 bra $L__BB282_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB353_1; -; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB282_1; +; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 
[func_retval0], %r14; ; SM90-NEXT: ret; @@ -16340,10 +13065,10 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16354,22 +13079,22 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB354_3; +; SM90-NEXT: @%p1 bra $L__BB283_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB354_1; -; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB283_1; +; SM90-NEXT: $L__BB283_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16377,52 +13102,6 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB355_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB355_1; -; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: release_seq_cst_i16_shared_sys( ; SM90: { @@ -16432,10 +13111,10 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16446,22 +13125,22 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB356_3; +; SM90-NEXT: @%p1 bra $L__BB284_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; 
SM90-NEXT: @%p2 bra $L__BB356_1; -; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB284_1; +; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16478,10 +13157,10 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16492,22 +13171,22 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB357_3; +; SM90-NEXT: @%p1 bra $L__BB285_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, 
%r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB357_1; -; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB285_1; +; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16524,10 +13203,10 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16538,22 +13217,22 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB358_3; +; SM90-NEXT: @%p1 bra $L__BB286_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // 
in Loop: Header=BB358_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB358_1; -; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB286_1; +; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16570,10 +13249,10 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16584,22 +13263,22 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB359_3; +; SM90-NEXT: @%p1 
bra $L__BB287_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB359_1; -; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB287_1; +; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16607,52 +13286,6 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB360_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM90-NEXT: @%p1 bra $L__BB360_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB360_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB360_1; -; SM90-NEXT: $L__BB360_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_monotonic_i16_generic_sys( ; SM90: { @@ -16662,10 +13295,10 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16676,22 +13309,22 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB361_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; 
SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB361_3; +; SM90-NEXT: @%p1 bra $L__BB288_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB361_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB361_1; -; SM90-NEXT: $L__BB361_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB288_1; +; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16708,10 +13341,10 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16722,22 +13355,22 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB362_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB362_3; +; SM90-NEXT: @%p1 bra $L__BB289_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB362_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB362_1; -; SM90-NEXT: $L__BB362_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB289_1; +; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16745,54 +13378,8 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: 
$L__BB363_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB363_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB363_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB363_1; -; SM90-NEXT: $L__BB363_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( +define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -16800,10 +13387,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; 
@@ -16814,31 +13401,31 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB364_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB364_3; +; SM90-NEXT: @%p1 bra $L__BB290_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB364_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB364_1; -; SM90-NEXT: $L__BB364_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB290_1; +; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic ret i16 %new } -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global( +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -16846,10 +13433,10 @@ define i16 
@acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16860,26 +13447,26 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB365_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB365_3; +; SM90-NEXT: @%p1 bra $L__BB291_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB365_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB365_1; -; SM90-NEXT: $L__BB365_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; 
SM90-NEXT: @%p2 bra $L__BB291_1; +; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } @@ -16892,10 +13479,10 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16906,22 +13493,22 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB366_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB366_3; +; SM90-NEXT: @%p1 bra $L__BB292_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB366_1 Depth=1 
+; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB366_1; -; SM90-NEXT: $L__BB366_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB292_1; +; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16938,10 +13525,10 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16952,22 +13539,22 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB367_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB367_3; +; SM90-NEXT: @%p1 bra $L__BB293_3; ; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB367_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB367_1; -; SM90-NEXT: $L__BB367_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB293_1; +; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -16984,10 +13571,10 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16998,22 +13585,22 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB368_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, 
[%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB368_3; +; SM90-NEXT: @%p1 bra $L__BB294_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB368_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB368_1; -; SM90-NEXT: $L__BB368_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB294_1; +; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17030,10 +13617,10 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17044,22 +13631,22 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB369_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 
%r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB369_3; +; SM90-NEXT: @%p1 bra $L__BB295_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB369_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB369_1; -; SM90-NEXT: $L__BB369_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB295_1; +; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17067,52 +13654,6 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB370_1: // %partword.cmpxchg.loop -; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB370_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB370_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB370_1; -; SM90-NEXT: $L__BB370_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_monotonic_i16_shared_sys( ; SM90: { @@ -17122,10 +13663,10 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17136,22 +13677,22 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, 
[%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB371_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB371_3; +; SM90-NEXT: @%p1 bra $L__BB296_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB371_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB371_1; -; SM90-NEXT: $L__BB371_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB296_1; +; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17168,10 +13709,10 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17182,22 +13723,22 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB372_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB372_3; +; SM90-NEXT: @%p1 bra $L__BB297_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB372_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB372_1; -; SM90-NEXT: $L__BB372_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB297_1; +; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17214,10 +13755,10 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17228,22 +13769,22 @@ define i16 
@acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB373_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB373_3; +; SM90-NEXT: @%p1 bra $L__BB298_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB373_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB373_1; -; SM90-NEXT: $L__BB373_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB298_1; +; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17260,10 +13801,10 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17274,22 +13815,22 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB374_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB374_3; +; SM90-NEXT: @%p1 bra $L__BB299_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB374_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB374_1; -; SM90-NEXT: $L__BB374_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB299_1; +; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17297,52 +13838,6 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: 
ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB375_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB375_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB375_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB375_1; -; SM90-NEXT: $L__BB375_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_acquire_i16_generic_sys( ; SM90: { @@ -17352,10 +13847,10 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, 
[acq_rel_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17366,22 +13861,22 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB376_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB376_3; +; SM90-NEXT: @%p1 bra $L__BB300_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB376_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB376_1; -; SM90-NEXT: $L__BB376_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB300_1; +; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17398,10 +13893,10 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; ; 
SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17412,22 +13907,22 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB377_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB377_3; +; SM90-NEXT: @%p1 bra $L__BB301_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB377_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB377_1; -; SM90-NEXT: $L__BB377_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB301_1; +; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17444,10 +13939,10 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; 
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17458,22 +13953,22 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB378_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB378_3; +; SM90-NEXT: @%p1 bra $L__BB302_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB378_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB378_1; -; SM90-NEXT: $L__BB378_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB302_1; +; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17490,10 +13985,10 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17504,22 +13999,22 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB379_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB379_3; +; SM90-NEXT: @%p1 bra $L__BB303_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB379_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB379_1; -; SM90-NEXT: $L__BB379_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB303_1; +; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17527,52 +14022,6 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: 
acq_rel_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB380_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB380_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB380_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB380_1; -; SM90-NEXT: $L__BB380_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_acquire_i16_global_sys( ; SM90: { @@ -17582,10 +14031,10 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: 
.reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17596,22 +14045,22 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB381_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB381_3; +; SM90-NEXT: @%p1 bra $L__BB304_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB381_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB381_1; -; SM90-NEXT: $L__BB381_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB304_1; +; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17628,10 +14077,10 @@ define 
i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17642,22 +14091,22 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB382_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB382_3; +; SM90-NEXT: @%p1 bra $L__BB305_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB382_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB382_1; -; SM90-NEXT: $L__BB382_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB305_1; +; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17665,54 +14114,8 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB383_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB383_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB383_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB383_1; -; SM90-NEXT: $L__BB383_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) 
%addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global_gpu( +define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cluster( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -17720,10 +14123,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17734,31 +14137,31 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB384_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, 
%r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB384_3; +; SM90-NEXT: @%p1 bra $L__BB306_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB384_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB384_1; -; SM90-NEXT: $L__BB384_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB306_1; +; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire ret i16 %new } -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared( +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -17766,10 +14169,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; 
SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17780,26 +14183,26 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB385_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB385_3; +; SM90-NEXT: @%p1 bra $L__BB307_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB385_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB385_1; -; SM90-NEXT: $L__BB385_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB307_1; +; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } @@ -17812,10 +14215,10 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: 
ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17826,22 +14229,22 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB386_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB386_3; +; SM90-NEXT: @%p1 bra $L__BB308_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB386_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB386_1; -; SM90-NEXT: $L__BB386_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB308_1; +; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17858,10 +14261,10 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; 
SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17872,22 +14275,22 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB387_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB387_3; +; SM90-NEXT: @%p1 bra $L__BB309_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB387_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB387_1; -; SM90-NEXT: $L__BB387_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB309_1; +; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17904,10 +14307,10 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17918,22 +14321,22 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB388_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB388_3; +; SM90-NEXT: @%p1 bra $L__BB310_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB388_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB388_1; -; SM90-NEXT: $L__BB388_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB310_1; +; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17950,10 +14353,10 @@ define i16 
@acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -17964,22 +14367,22 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB389_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB389_3; +; SM90-NEXT: @%p1 bra $L__BB311_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB389_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB389_1; -; SM90-NEXT: $L__BB389_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB311_1; +; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -17987,52 +14390,6 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB390_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB390_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB390_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB390_1; -; SM90-NEXT: $L__BB390_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - define i16 
@acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys( ; SM90: { @@ -18042,10 +14399,10 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18056,22 +14413,22 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB391_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB391_3; +; SM90-NEXT: @%p1 bra $L__BB312_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB391_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB391_1; -; SM90-NEXT: $L__BB391_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; 
SM90-NEXT: @%p2 bra $L__BB312_1; +; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18088,10 +14445,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18102,22 +14459,22 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB392_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB392_3; +; SM90-NEXT: @%p1 bra $L__BB313_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB392_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB392_1; -; SM90-NEXT: $L__BB392_3: // 
%partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB313_1; +; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18134,10 +14491,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18148,22 +14505,22 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB393_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB393_3; +; SM90-NEXT: @%p1 bra $L__BB314_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB393_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB393_1; -; SM90-NEXT: $L__BB393_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB314_1; +; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18180,10 +14537,10 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18194,22 +14551,22 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB394_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB394_3; +; SM90-NEXT: @%p1 bra $L__BB315_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB394_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 ; SM90-NEXT: 
and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB394_1; -; SM90-NEXT: $L__BB394_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB315_1; +; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18217,52 +14574,6 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB395_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB395_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB395_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB395_1; -; SM90-NEXT: $L__BB395_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_seq_cst_i16_global_sys( ; SM90: { @@ -18272,10 +14583,10 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18286,22 +14597,22 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB396_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB396_3; +; SM90-NEXT: @%p1 bra 
$L__BB316_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB396_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB396_1; -; SM90-NEXT: $L__BB396_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB316_1; +; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18318,10 +14629,10 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18332,22 +14643,22 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB397_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB397_3; +; SM90-NEXT: @%p1 bra $L__BB317_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB397_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB397_1; -; SM90-NEXT: $L__BB397_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB317_1; +; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18364,10 +14675,10 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18378,22 +14689,22 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB398_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: 
or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB398_3; +; SM90-NEXT: @%p1 bra $L__BB318_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB398_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB398_1; -; SM90-NEXT: $L__BB398_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB318_1; +; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18410,10 +14721,10 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18424,22 +14735,22 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB399_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop ; SM90-NEXT: 
// =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB399_3; +; SM90-NEXT: @%p1 bra $L__BB319_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB399_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB399_1; -; SM90-NEXT: $L__BB399_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB319_1; +; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18447,52 +14758,6 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: 
$L__BB400_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB400_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB400_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB400_1; -; SM90-NEXT: $L__BB400_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys( ; SM90: { @@ -18502,10 +14767,10 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18516,22 +14781,22 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: 
ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB401_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB401_3; +; SM90-NEXT: @%p1 bra $L__BB320_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB401_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB401_1; -; SM90-NEXT: $L__BB401_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB320_1; +; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18548,10 +14813,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18562,22 +14827,22 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB402_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB402_3; +; SM90-NEXT: @%p1 bra $L__BB321_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB402_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB402_1; -; SM90-NEXT: $L__BB402_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB321_1; +; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18594,56 +14859,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB403_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB403_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB403_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB403_1; -; SM90-NEXT: $L__BB403_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18654,31 +14873,31 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB404_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB404_3; +; SM90-NEXT: @%p1 bra $L__BB322_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB404_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB404_1; -; SM90-NEXT: $L__BB404_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB322_1; +; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic( +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -18686,10 +14905,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[seq_cst_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18700,26 +14919,26 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB405_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB405_3; +; SM90-NEXT: @%p1 bra $L__BB323_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB405_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB405_1; -; SM90-NEXT: $L__BB405_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB323_1; +; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } @@ -18732,10 +14951,10 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18746,22 +14965,22 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB406_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB406_3; +; SM90-NEXT: @%p1 bra $L__BB324_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB406_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB406_1; -; SM90-NEXT: $L__BB406_3: 
// %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB324_1; +; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18778,10 +14997,10 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18792,22 +15011,22 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB407_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB407_3; +; SM90-NEXT: @%p1 bra $L__BB325_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB407_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 
%r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB407_1; -; SM90-NEXT: $L__BB407_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB325_1; +; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18824,10 +15043,10 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18838,22 +15057,22 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB408_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB408_3; +; SM90-NEXT: @%p1 bra $L__BB326_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB408_1 Depth=1 +; SM90-NEXT: // in Loop: 
Header=BB326_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB408_1; -; SM90-NEXT: $L__BB408_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB326_1; +; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18870,10 +15089,10 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18884,22 +15103,22 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB409_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB409_3; +; SM90-NEXT: @%p1 bra $L__BB327_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB409_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB409_1; -; SM90-NEXT: $L__BB409_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB327_1; +; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -18907,52 +15126,6 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB410_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB410_3; -; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB410_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB410_1; -; SM90-NEXT: $L__BB410_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_monotonic_i16_global_sys( ; SM90: { @@ -18962,10 +15135,10 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -18976,22 +15149,22 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB411_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; 
SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB411_3; +; SM90-NEXT: @%p1 bra $L__BB328_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB411_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB411_1; -; SM90-NEXT: $L__BB411_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB328_1; +; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19008,10 +15181,10 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19022,22 +15195,22 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB412_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB412_3; +; SM90-NEXT: @%p1 bra $L__BB329_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB412_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB412_1; -; SM90-NEXT: $L__BB412_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB329_1; +; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19054,10 +15227,10 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19068,22 +15241,22 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: 
$L__BB413_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB413_3; +; SM90-NEXT: @%p1 bra $L__BB330_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB413_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB413_1; -; SM90-NEXT: $L__BB413_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB330_1; +; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19100,10 +15273,10 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19114,22 +15287,22 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, 
[%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB414_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB414_3; +; SM90-NEXT: @%p1 bra $L__BB331_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB414_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB414_1; -; SM90-NEXT: $L__BB414_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB331_1; +; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19137,52 +15310,6 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, 
%r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB415_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB415_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB415_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB415_1; -; SM90-NEXT: $L__BB415_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_monotonic_i16_shared_sys( ; SM90: { @@ -19192,10 +15319,10 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19206,22 +15333,22 @@ define i16 
@seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB416_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB416_3; +; SM90-NEXT: @%p1 bra $L__BB332_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB416_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB416_1; -; SM90-NEXT: $L__BB416_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB332_1; +; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19238,10 +15365,10 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19252,22 +15379,22 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB417_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB417_3; +; SM90-NEXT: @%p1 bra $L__BB333_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB417_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB417_1; -; SM90-NEXT: $L__BB417_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB333_1; +; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19284,10 +15411,10 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; +; 
SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19298,22 +15425,22 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB418_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB418_3; +; SM90-NEXT: @%p1 bra $L__BB334_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB418_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB418_1; -; SM90-NEXT: $L__BB418_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB334_1; +; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19330,10 +15457,10 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; ; 
SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19344,22 +15471,22 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB419_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB419_3; +; SM90-NEXT: @%p1 bra $L__BB335_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB419_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB419_1; -; SM90-NEXT: $L__BB419_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB335_1; +; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19367,52 +15494,6 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ret i16 %new } -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: 
-; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB420_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB420_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB420_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB420_1; -; SM90-NEXT: $L__BB420_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_acquire_i16_generic_sys( ; SM90: { @@ -19422,10 +15503,10 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: 
ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19436,22 +15517,22 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB421_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB421_3; +; SM90-NEXT: @%p1 bra $L__BB336_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB421_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB421_1; -; SM90-NEXT: $L__BB421_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB336_1; +; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19468,10 +15549,10 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19482,22 +15563,22 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB422_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB422_3; +; SM90-NEXT: @%p1 bra $L__BB337_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB422_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB422_1; -; SM90-NEXT: $L__BB422_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB337_1; +; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19514,10 +15595,10 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19528,22 +15609,22 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB423_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB423_3; +; SM90-NEXT: @%p1 bra $L__BB338_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB423_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB423_1; -; SM90-NEXT: $L__BB423_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB338_1; +; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19551,54 +15632,8 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { 
ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB424_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB424_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB424_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB424_1; -; SM90-NEXT: $L__BB424_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global( +define i16 
@seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -19606,10 +15641,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19620,26 +15655,26 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB425_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB425_3; +; SM90-NEXT: @%p1 bra $L__BB339_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB425_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, 
%r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB425_1; -; SM90-NEXT: $L__BB425_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB339_1; +; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire ret i16 %new } @@ -19652,10 +15687,10 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19666,22 +15701,22 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB426_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, 
%r17; -; SM90-NEXT: @%p1 bra $L__BB426_3; +; SM90-NEXT: @%p1 bra $L__BB340_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB426_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB426_1; -; SM90-NEXT: $L__BB426_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB340_1; +; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19698,10 +15733,10 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19712,22 +15747,22 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB427_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB427_3; +; SM90-NEXT: @%p1 bra $L__BB341_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB427_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB427_1; -; SM90-NEXT: $L__BB427_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB341_1; +; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19744,10 +15779,10 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19758,22 +15793,22 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB428_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB428_3; +; SM90-NEXT: @%p1 bra $L__BB342_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB428_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB428_1; -; SM90-NEXT: $L__BB428_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB342_1; +; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19790,10 +15825,10 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19804,22 +15839,22 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB429_1: // 
%partword.cmpxchg.loop +; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB429_3; +; SM90-NEXT: @%p1 bra $L__BB343_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB429_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB429_1; -; SM90-NEXT: $L__BB429_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB343_1; +; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19827,52 +15862,6 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB430_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB430_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB430_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB430_1; -; SM90-NEXT: $L__BB430_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_acquire_i16_shared_sys( ; SM90: { @@ -19882,10 +15871,10 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19896,22 +15885,22 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB431_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB431_3; +; SM90-NEXT: @%p1 bra $L__BB344_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB431_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB431_1; -; SM90-NEXT: $L__BB431_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB344_1; +; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19928,10 +15917,10 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -19942,22 +15931,22 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, 
i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB432_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB432_3; +; SM90-NEXT: @%p1 bra $L__BB345_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB432_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB432_1; -; SM90-NEXT: $L__BB432_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB345_1; +; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -19974,10 +15963,10 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, 
%r10, 3; @@ -19988,22 +15977,22 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB433_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB433_3; +; SM90-NEXT: @%p1 bra $L__BB346_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB433_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB433_1; -; SM90-NEXT: $L__BB433_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB346_1; +; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20020,10 +16009,10 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20034,22 +16023,22 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB434_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB434_3; +; SM90-NEXT: @%p1 bra $L__BB347_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB434_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB434_1; -; SM90-NEXT: $L__BB434_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB347_1; +; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20057,52 +16046,6 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM90-NEXT: 
fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB435_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB435_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB435_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB435_1; -; SM90-NEXT: $L__BB435_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_seq_cst_i16_generic_sys( ; SM90: { @@ -20112,10 +16055,10 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: 
ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20126,22 +16069,22 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB436_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB436_3; +; SM90-NEXT: @%p1 bra $L__BB348_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB436_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB436_1; -; SM90-NEXT: $L__BB436_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB348_1; +; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20158,10 +16101,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20172,22 +16115,22 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB437_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB437_3; +; SM90-NEXT: @%p1 bra $L__BB349_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB437_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB437_1; -; SM90-NEXT: $L__BB437_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB349_1; +; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20204,10 +16147,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20218,22 +16161,22 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB438_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB438_3; +; SM90-NEXT: @%p1 bra $L__BB350_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB438_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB438_1; -; SM90-NEXT: $L__BB438_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB350_1; +; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20250,10 +16193,10 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: 
ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20264,22 +16207,22 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB439_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB439_3; +; SM90-NEXT: @%p1 bra $L__BB351_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB439_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB439_1; -; SM90-NEXT: $L__BB439_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB351_1; +; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20287,52 +16230,6 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ret i16 %new } -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { 
-; SM90-LABEL: seq_cst_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB440_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB440_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB440_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB440_1; -; SM90-NEXT: $L__BB440_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_seq_cst_i16_global_sys( ; SM90: { @@ -20342,10 +16239,10 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20356,22 +16253,22 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB441_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB441_3; +; SM90-NEXT: @%p1 bra $L__BB352_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB441_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB441_1; -; SM90-NEXT: $L__BB441_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB352_1; +; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20388,10 +16285,10 @@ 
define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20402,22 +16299,22 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB442_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB442_3; +; SM90-NEXT: @%p1 bra $L__BB353_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB442_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB442_1; -; SM90-NEXT: $L__BB442_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB353_1; +; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20434,10 +16331,10 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20448,22 +16345,22 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB443_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB443_3; +; SM90-NEXT: @%p1 bra $L__BB354_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB443_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB443_1; -; SM90-NEXT: $L__BB443_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 
bra $L__BB354_1; +; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20480,10 +16377,10 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20494,22 +16391,22 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: ld.global.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB444_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB444_3; +; SM90-NEXT: @%p1 bra $L__BB355_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB444_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB444_1; -; SM90-NEXT: 
$L__BB444_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB355_1; +; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20517,52 +16414,6 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ret i16 %new } -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB445_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB445_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB445_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB445_1; -; SM90-NEXT: $L__BB445_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM90-LABEL: seq_cst_seq_cst_i16_shared_sys( ; SM90: { @@ -20572,10 +16423,10 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20586,22 +16437,22 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB446_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB446_3; +; SM90-NEXT: @%p1 bra $L__BB356_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB446_1 Depth=1 +; SM90-NEXT: // 
in Loop: Header=BB356_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB446_1; -; SM90-NEXT: $L__BB446_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB356_1; +; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20618,10 +16469,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20632,22 +16483,22 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB447_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB447_3; +; SM90-NEXT: @%p1 bra $L__BB357_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB447_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB447_1; -; SM90-NEXT: $L__BB447_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB357_1; +; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20664,10 +16515,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20678,22 +16529,22 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB448_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 
%p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB448_3; +; SM90-NEXT: @%p1 bra $L__BB358_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB448_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB448_1; -; SM90-NEXT: $L__BB448_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB358_1; +; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20710,10 +16561,10 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -20724,22 +16575,22 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB449_1: // %partword.cmpxchg.loop +; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; 
SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB449_3; +; SM90-NEXT: @%p1 bra $L__BB359_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB449_1 Depth=1 +; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB449_1; -; SM90-NEXT: $L__BB449_3: // %partword.cmpxchg.end +; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB359_1; +; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; @@ -20747,23 +16598,6 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_monotonic_i32_generic_sys( ; SM90: { @@ -20771,9 +16605,9 @@ define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[monotonic_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20788,9 +16622,9 @@ define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20805,9 +16639,9 @@ define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, 
%r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20822,9 +16656,9 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20832,23 +16666,6 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_monotonic_i32_global_sys( ; SM90: { @@ -20856,9 +16673,9 @@ define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20873,9 +16690,9 @@ define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20890,9 +16707,9 @@ define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 
%r2, [monotonic_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20907,9 +16724,9 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20917,23 +16734,6 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ret i32 %new } -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_monotonic_i32_shared_sys( ; SM90: { @@ -20941,9 +16741,9 @@ define i32 
@monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20958,9 +16758,9 @@ define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20975,9 +16775,9 @@ define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, 
[monotonic_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -20992,9 +16792,9 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21002,23 +16802,6 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ret i32 %new } -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { 
; SM90-LABEL: monotonic_acquire_i32_generic_sys( ; SM90: { @@ -21026,9 +16809,9 @@ define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21043,9 +16826,9 @@ define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21060,9 +16843,9 @@ define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; 
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21077,9 +16860,9 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21087,23 +16870,6 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - define i32 
@monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_acquire_i32_global_sys( ; SM90: { @@ -21111,9 +16877,9 @@ define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21128,9 +16894,9 @@ define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21145,9 +16911,9 @@ define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; 
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21162,9 +16928,9 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21172,23 +16938,6 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, 
i32 %new monotonic acquire - ret i32 %new -} - define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_acquire_i32_shared_sys( ; SM90: { @@ -21196,9 +16945,9 @@ define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21213,9 +16962,9 @@ define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21230,9 +16979,9 @@ define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: 
ld.param.b32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21247,9 +16996,9 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21257,24 +17006,6 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], 
%r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_seq_cst_i32_generic_sys( ; SM90: { @@ -21282,10 +17013,10 @@ define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21300,10 +17031,10 @@ define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21318,10 +17049,10 @@ define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21336,10 +17067,10 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21347,24 +17078,6 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM90-NEXT: 
ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_seq_cst_i32_global_sys( ; SM90: { @@ -21372,10 +17085,10 @@ define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21390,10 +17103,10 @@ define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 
[func_retval0], %r2; ; SM90-NEXT: ret; @@ -21408,10 +17121,10 @@ define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21426,10 +17139,10 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21437,24 +17150,6 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: 
-; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: monotonic_seq_cst_i32_shared_sys( ; SM90: { @@ -21462,10 +17157,10 @@ define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21480,10 +17175,10 @@ define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, 
[monotonic_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21498,10 +17193,10 @@ define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21516,10 +17211,10 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21527,23 +17222,6 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define 
i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_monotonic_i32_generic_sys( ; SM90: { @@ -21551,9 +17229,9 @@ define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21568,9 +17246,9 @@ define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, 
[acquire_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21585,9 +17263,9 @@ define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21602,9 +17280,9 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21612,23 +17290,6 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } 
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_monotonic_i32_global_sys( ; SM90: { @@ -21636,9 +17297,9 @@ define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21653,9 +17314,9 @@ define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21670,9 +17331,9 @@ define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21687,9 +17348,9 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21697,23 +17358,6 @@ define i32 
@acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_monotonic_i32_shared_sys( ; SM90: { @@ -21721,9 +17365,9 @@ define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21738,9 +17382,9 @@ define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[acquire_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21755,9 +17399,9 @@ define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21772,9 +17416,9 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 
[func_retval0], %r2; ; SM90-NEXT: ret; @@ -21782,23 +17426,6 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_acquire_i32_generic_sys( ; SM90: { @@ -21806,9 +17433,9 @@ define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21823,9 +17450,9 @@ define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[acquire_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21840,9 +17467,9 @@ define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21857,9 +17484,9 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ 
-21867,23 +17494,6 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_acquire_i32_global_sys( ; SM90: { @@ -21891,9 +17501,9 @@ define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21908,9 +17518,9 @@ define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[acquire_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21925,9 +17535,9 @@ define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21942,9 +17552,9 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: 
ret; @@ -21952,23 +17562,6 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_acquire_i32_shared_sys( ; SM90: { @@ -21976,9 +17569,9 @@ define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -21993,9 +17586,9 @@ define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[acquire_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22010,9 +17603,9 @@ define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22027,9 +17620,9 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: 
ret; @@ -22037,24 +17630,6 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_seq_cst_i32_generic_sys( ; SM90: { @@ -22062,10 +17637,10 @@ define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22080,10 +17655,10 @@ define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 
%rd1, [acquire_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22098,10 +17673,10 @@ define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22116,10 +17691,10 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22127,24 +17702,6 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_seq_cst_i32_global_sys( ; SM90: { @@ -22152,10 +17709,10 @@ define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22170,10 +17727,10 @@ define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg 
.b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22188,10 +17745,10 @@ define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22206,10 +17763,10 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u32 
%r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22217,24 +17774,6 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acquire_seq_cst_i32_shared_sys( ; SM90: { @@ -22242,10 +17781,10 @@ define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; 
; SM90-NEXT: ret; @@ -22260,10 +17799,10 @@ define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22278,10 +17817,10 @@ define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22296,10 +17835,10 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, 
[acquire_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22307,23 +17846,6 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_monotonic_i32_generic_sys( ; SM90: { @@ -22331,9 +17853,9 @@ define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: 
atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22348,9 +17870,9 @@ define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22365,9 +17887,9 @@ define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22382,9 +17904,9 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; 
SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22392,23 +17914,6 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_monotonic_i32_global_sys( ; SM90: { @@ -22416,9 +17921,9 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[release_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22433,9 +17938,9 @@ define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22444,53 +17949,36 @@ define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i } define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cluster_param_2]; -; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global_gpu( +; SM90-LABEL: 
release_monotonic_i32_global_cluster( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; -; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic ret i32 %new } -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared( +define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_gpu( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], 
%r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic ret i32 %new } @@ -22501,9 +17989,9 @@ define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22518,9 +18006,9 @@ define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22535,9 +18023,9 @@ define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22552,9 +18040,9 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22562,23 +18050,6 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; 
SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_acquire_i32_generic_sys( ; SM90: { @@ -22586,9 +18057,9 @@ define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22603,9 +18074,9 @@ define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22620,9 +18091,9 @@ define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22637,9 +18108,9 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22647,23 +18118,6 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 
[func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_acquire_i32_global_sys( ; SM90: { @@ -22671,9 +18125,9 @@ define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22688,9 +18142,9 @@ define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22705,9 +18159,9 @@ define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22722,9 +18176,9 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22732,23 +18186,6 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 
[func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_acquire_i32_shared_sys( ; SM90: { @@ -22756,9 +18193,9 @@ define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22773,9 +18210,9 @@ define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22790,9 +18227,9 @@ define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22807,9 +18244,9 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22817,24 +18254,6 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: 
st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_seq_cst_i32_generic_sys( ; SM90: { @@ -22842,10 +18261,10 @@ define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22860,10 +18279,10 @@ define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22878,10 +18297,10 @@ define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22896,10 +18315,10 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22907,24 +18326,6 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[release_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_seq_cst_i32_global_sys( ; SM90: { @@ -22932,10 +18333,10 @@ define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22950,10 +18351,10 @@ define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22968,10 
+18369,10 @@ define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22986,10 +18387,10 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -22997,24 +18398,6 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: release_seq_cst_i32_shared_sys( ; SM90: { @@ -23022,10 +18405,10 @@ define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23040,10 +18423,10 @@ define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[release_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23058,10 +18441,10 @@ define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23076,10 +18459,10 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23087,23 +18470,6 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: 
acq_rel_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_monotonic_i32_generic_sys( ; SM90: { @@ -23111,9 +18477,9 @@ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23128,9 +18494,9 @@ define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, 
[acq_rel_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23145,9 +18511,9 @@ define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23162,9 +18528,9 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23172,23 +18538,6 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 
%new) { -; SM90-LABEL: acq_rel_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_monotonic_i32_global_sys( ; SM90: { @@ -23196,9 +18545,9 @@ define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23213,9 +18562,9 @@ define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, 
[acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23230,9 +18579,9 @@ define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23247,9 +18596,9 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23257,23 +18606,6 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 
%new } -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_monotonic_i32_shared_sys( ; SM90: { @@ -23281,9 +18613,9 @@ define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23298,9 +18630,9 @@ define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23315,9 +18647,9 @@ define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23332,9 +18664,9 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23342,23 +18674,6 @@ define i32 
@acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_acquire_i32_generic_sys( ; SM90: { @@ -23366,9 +18681,9 @@ define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23383,9 +18698,9 @@ define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23400,9 +18715,9 @@ define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23417,9 +18732,9 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23427,23 +18742,6 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr 
%addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_acquire_i32_global_sys( ; SM90: { @@ -23451,9 +18749,9 @@ define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23468,9 +18766,9 @@ define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23485,9 +18783,9 @@ define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23502,9 +18800,9 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23512,23 +18810,6 @@ define i32 
@acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_acquire_i32_shared_sys( ; SM90: { @@ -23536,9 +18817,9 @@ define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23553,9 +18834,9 @@ define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; 
SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23570,9 +18851,9 @@ define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23587,9 +18868,9 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23597,24 +18878,6 @@ define i32 
@acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys( ; SM90: { @@ -23622,10 +18885,10 @@ define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23640,10 +18903,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, 
[acq_rel_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23658,10 +18921,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23676,10 +18939,10 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 
%r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23687,24 +18950,6 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_seq_cst_i32_global_sys( ; SM90: { @@ -23712,10 +18957,10 @@ define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23730,10 +18975,10 @@ define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: 
; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23748,10 +18993,10 @@ define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23766,10 +19011,10 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, 
[acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23777,24 +19022,6 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys( ; SM90: { @@ -23802,10 +19029,10 @@ define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM90-NEXT: ret; @@ -23820,10 +19047,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23838,10 +19065,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23856,10 +19083,10 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, 
[acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23867,24 +19094,6 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_monotonic_i32_generic_sys( ; SM90: { @@ -23892,10 +19101,10 @@ define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[seq_cst_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23910,10 +19119,10 @@ define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23928,10 +19137,10 @@ define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23946,10 +19155,10 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -23957,24 +19166,6 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_monotonic_i32_global_sys( ; SM90: { @@ -23982,10 +19173,10 @@ define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, 
[seq_cst_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24000,10 +19191,10 @@ define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24018,10 +19209,10 @@ define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], 
%r2; ; SM90-NEXT: ret; @@ -24036,10 +19227,10 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24047,24 +19238,6 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ret i32 %new } -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_monotonic_i32_shared_sys( ; SM90: { @@ -24072,10 +19245,10 @@ define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24090,10 +19263,10 @@ define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24108,10 +19281,10 @@ define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, 
[seq_cst_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24126,10 +19299,10 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24137,24 +19310,6 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ret i32 %new } -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_acquire_i32_generic_sys( ; SM90: 
{ @@ -24162,10 +19317,10 @@ define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24180,10 +19335,10 @@ define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24198,10 +19353,10 @@ define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; 
-; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24216,10 +19371,10 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24227,24 +19382,6 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - define i32 
@seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_acquire_i32_global_sys( ; SM90: { @@ -24252,10 +19389,10 @@ define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24270,10 +19407,10 @@ define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24288,10 +19425,10 @@ define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_acquire_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24306,32 +19443,14 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, 
[seq_cst_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire ret i32 %new } @@ -24342,10 +19461,10 @@ define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24360,10 +19479,10 @@ define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24378,10 +19497,10 @@ define i32 
@seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24396,10 +19515,10 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24407,24 +19526,6 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM90-NEXT: 
fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_seq_cst_i32_generic_sys( ; SM90: { @@ -24432,10 +19533,10 @@ define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24450,10 +19551,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; 
SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24468,10 +19569,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24486,10 +19587,10 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24497,24 +19598,6 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ret i32 %new } -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: 
// %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_seq_cst_i32_global_sys( ; SM90: { @@ -24522,10 +19605,10 @@ define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24540,10 +19623,10 @@ define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: 
ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24558,10 +19641,10 @@ define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24576,10 +19659,10 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24587,24 +19670,6 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ret i32 %new } -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; 
SM90-LABEL: seq_cst_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys( ; SM90: { @@ -24612,10 +19677,10 @@ define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24630,10 +19695,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: 
ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24648,10 +19713,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24666,10 +19731,10 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -24677,31 +19742,15 @@ define i32 
@seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ret i32 %new } -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24715,9 +19764,9 @@ define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24731,9 +19780,9 @@ define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24747,9 +19796,9 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24757,31 +19806,15 @@ define i64 
@monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24795,9 +19828,9 @@ define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24811,9 +19844,9 @@ define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24827,9 +19860,9 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM90-NEXT: ret; @@ -24837,31 +19870,15 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ret i64 %new } -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_monotonic_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24875,9 +19892,9 @@ define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; 
SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24891,9 +19908,9 @@ define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24907,9 +19924,9 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24917,31 +19934,15 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ret i64 %new } -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24955,9 +19956,9 @@ define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24971,9 +19972,9 @@ define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24987,9 +19988,9 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -24997,31 +19998,15 
@@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25035,9 +20020,9 @@ define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; 
SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25051,9 +20036,9 @@ define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25067,9 +20052,9 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25077,31 +20062,15 @@ define i64 
@monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25115,9 +20084,9 @@ define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25131,9 +20100,9 @@ define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25147,9 +20116,9 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25157,33 +20126,16 @@ define i64 
@monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25197,10 +20149,10 @@ define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25214,10 +20166,10 @@ define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25231,10 +20183,10 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25242,33 +20194,16 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25282,10 +20217,10 @@ define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25299,10 +20234,10 @@ define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25316,10 +20251,10 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25327,33 +20262,16 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: monotonic_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25367,10 +20285,10 @@ define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25384,10 +20302,10 @@ define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25401,10 +20319,10 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25412,31 +20330,15 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25450,9 +20352,9 @@ define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, 
i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25466,9 +20368,9 @@ define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25482,9 +20384,9 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25492,31 +20394,15 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25530,9 +20416,9 @@ define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 
%rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25546,9 +20432,9 @@ define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25562,9 +20448,9 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25572,31 +20458,15 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_monotonic_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25610,9 +20480,9 @@ define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 
%rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25626,9 +20496,9 @@ define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25642,9 +20512,9 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25652,31 +20522,15 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25690,9 +20544,9 @@ define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 
%rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25706,9 +20560,9 @@ define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25722,9 +20576,9 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25732,31 +20586,15 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25770,9 +20608,9 @@ define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25786,9 +20624,9 @@ define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25802,9 +20640,9 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25812,31 +20650,15 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25850,9 +20672,9 @@ define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[acquire_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25866,9 +20688,9 @@ define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25882,9 +20704,9 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25892,33 +20714,16 @@ define i64 
@acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25932,10 +20737,10 @@ define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; 
SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25949,10 +20754,10 @@ define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25966,10 +20771,10 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -25977,33 +20782,16 @@ define i64 
@acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26017,10 +20805,10 @@ define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26034,10 +20822,10 @@ define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26051,10 +20839,10 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM90-NEXT: ret; @@ -26062,33 +20850,16 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acquire_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26102,10 +20873,10 @@ define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: 
fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26119,10 +20890,10 @@ define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26136,10 +20907,10 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26147,31 +20918,15 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26185,9 +20940,9 @@ define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[release_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26201,9 +20956,9 @@ define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26217,9 +20972,9 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26227,31 +20982,15 
@@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26265,9 +21004,9 @@ define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; 
SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26281,9 +21020,9 @@ define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26297,29 +21036,13 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 
%new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic ret i64 %new } @@ -26329,9 +21052,9 @@ define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26345,9 +21068,9 @@ define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[release_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26361,9 +21084,9 @@ define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26377,9 +21100,9 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26387,31 
+21110,15 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26425,9 +21132,9 @@ define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, 
[release_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26441,9 +21148,9 @@ define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26457,9 +21164,9 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26467,31 +21174,15 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 
@release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26505,9 +21196,9 @@ define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; 
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26521,9 +21212,9 @@ define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26537,9 +21228,9 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26547,31 +21238,15 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
release_acquire_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26585,9 +21260,9 @@ define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: 
atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26601,9 +21276,9 @@ define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26617,9 +21292,9 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26627,33 +21302,16 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26667,10 +21325,10 @@ define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26684,10 +21342,10 @@ define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26701,10 +21359,10 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26712,33 +21370,16 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26752,10 +21393,10 @@ define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: 
atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26769,10 +21410,10 @@ define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26786,10 +21427,10 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26797,33 +21438,16 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared( -; SM90: { -; 
SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: release_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26837,10 +21461,10 @@ define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[release_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26854,10 +21478,10 @@ define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26871,10 +21495,10 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26882,31 +21506,15 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; 
SM90-LABEL: acq_rel_monotonic_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26920,9 +21528,9 @@ define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, 
[acq_rel_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26936,9 +21544,9 @@ define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26952,9 +21560,9 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -26962,31 +21570,15 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global( -; 
SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27000,9 +21592,9 @@ define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; ; 
SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27016,9 +21608,9 @@ define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27032,9 +21624,9 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27042,31 +21634,15 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 
%rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27080,9 +21656,9 @@ define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: 
atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27096,9 +21672,9 @@ define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27112,9 +21688,9 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27122,31 +21698,15 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27160,9 +21720,9 @@ define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; 
@@ -27176,9 +21736,9 @@ define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27192,9 +21752,9 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27202,31 +21762,15 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27240,9 +21784,9 @@ define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27256,9 +21800,9 @@ define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) 
%addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27272,9 +21816,9 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27282,31 +21826,15 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[acq_rel_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27320,9 +21848,9 @@ define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27336,9 +21864,9 @@ define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27352,9 +21880,9 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27362,33 +21890,16 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM90-NEXT: 
atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27402,10 +21913,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27419,10 +21930,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27436,10 +21947,10 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27447,33 +21958,16 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM90-NEXT: 
atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27487,10 +21981,10 @@ define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27504,10 +21998,10 @@ define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27521,10 +22015,10 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27532,33 +22026,16 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[acq_rel_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27572,10 +22049,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27589,10 +22066,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 
%cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27606,10 +22083,10 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27617,33 +22094,16 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27657,10 +22117,10 @@ define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27674,10 +22134,10 @@ define 
i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27691,10 +22151,10 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27702,33 +22162,16 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_monotonic_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27742,10 +22185,10 @@ define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: 
atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27759,10 +22202,10 @@ define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27776,10 +22219,10 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27787,33 +22230,16 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ret i64 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
seq_cst_monotonic_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new -} - define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27827,10 +22253,10 @@ define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27844,10 +22270,10 @@ define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27861,10 +22287,10 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27872,33 +22298,16 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i 
ret i64 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_acquire_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27912,10 +22321,10 @@ define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; 
SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27929,10 +22338,10 @@ define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27946,10 +22355,10 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27957,33 +22366,16 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new 
} -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_acquire_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -27997,10 +22389,10 @@ define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28014,10 +22406,10 @@ define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28031,10 +22423,10 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28042,33 +22434,16 @@ define i64 
@seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new -} - define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_acquire_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28082,10 +22457,10 @@ define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28099,10 +22474,10 @@ define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28116,10 +22491,10 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM90-NEXT: ret; @@ -28127,33 +22502,16 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28167,10 +22525,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28184,10 +22542,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28201,10 +22559,10 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: 
ret; @@ -28212,33 +22570,16 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28252,10 +22593,10 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; 
SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28269,10 +22610,10 @@ define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28286,10 +22627,10 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28297,33 +22638,16 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new -} - define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28337,10 +22661,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28354,10 +22678,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -28371,10 +22695,10 @@ define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 547b13136ff93..f5c22664394b5 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -18,21 +18,21 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -48,7 +48,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: mov.b32 %r20, %r8; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; 
SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i8( @@ -59,21 +59,21 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -89,7 +89,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i8( ; SM90: { @@ -99,8 +99,8 @@ define i8 
@relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -111,9 +111,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -144,21 +144,21 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; +; SM30-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; 
SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -175,7 +175,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; ; SM30-NEXT: ret; ; ; SM70-LABEL: acquire_sys_i8( @@ -186,21 +186,21 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -217,7 +217,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i8( ; SM90: { @@ -227,8 +227,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -239,9 +239,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -273,22 +273,22 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; +; SM30-NEXT: ld.param.u8 %rs1, 
[release_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -304,7 +304,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: mov.b32 %r20, %r8; ; SM30-NEXT: @%p2 bra $L__BB2_1; ; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; ; SM30-NEXT: ret; ; ; SM70-LABEL: release_sys_i8( @@ -315,22 +315,22 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, 
[release_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -346,7 +346,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i8( ; SM90: { @@ -356,8 +356,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -369,9 +369,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 
%r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -402,22 +402,22 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; +; SM30-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -434,7 +434,7 @@ define i8 @acq_rel_sys_i8(ptr 
%addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB3_1; ; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; ; SM30-NEXT: ret; ; ; SM70-LABEL: acq_rel_sys_i8( @@ -445,22 +445,22 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -477,7 +477,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i8( ; SM90: { @@ -487,8 +487,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -500,9 +500,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -534,22 +534,22 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; +; SM30-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, 
%r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; +; SM30-NEXT: cvt.u32.u64 %r9, %rd2; +; SM30-NEXT: and.b32 %r10, %r9, 3; +; SM30-NEXT: shl.b32 %r1, %r10, 3; +; SM30-NEXT: mov.b32 %r11, 255; +; SM30-NEXT: shl.b32 %r12, %r11, %r1; +; SM30-NEXT: not.b32 %r2, %r12; +; SM30-NEXT: cvt.u32.u16 %r13, %rs1; +; SM30-NEXT: and.b32 %r14, %r13, 255; +; SM30-NEXT: shl.b32 %r3, %r14, %r1; +; SM30-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.u32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -566,7 +566,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB4_1; ; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r13; ; SM30-NEXT: ret; ; ; SM70-LABEL: seq_cst_sys_i8( @@ -577,22 +577,22 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; +; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: cvt.u32.u64 %r9, %rd2; +; SM70-NEXT: and.b32 %r10, %r9, 3; +; 
SM70-NEXT: shl.b32 %r1, %r10, 3; +; SM70-NEXT: mov.b32 %r11, 255; +; SM70-NEXT: shl.b32 %r12, %r11, %r1; +; SM70-NEXT: not.b32 %r2, %r12; +; SM70-NEXT: cvt.u32.u16 %r13, %rs1; +; SM70-NEXT: and.b32 %r14, %r13, 255; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; +; SM70-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.u32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -609,7 +609,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r13; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i8( ; SM90: { @@ -619,8 +619,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -632,9 +632,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -667,10 +667,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg 
.b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; +; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -680,7 +680,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; +; SM30-NEXT: ld.u32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -707,10 +707,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -720,7 +720,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -746,10 +746,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -759,7 +759,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -790,10 +790,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; +; SM30-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; +; SM30-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -803,7 +803,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; +; SM30-NEXT: ld.u32 %r15, [%rd1]; ; SM30-NEXT: and.b32 
%r19, %r15, %r2; ; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -831,10 +831,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -844,7 +844,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -871,10 +871,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -884,7 +884,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 
%r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -916,10 +916,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; +; SM30-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM30-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; @@ -930,7 +930,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; +; SM30-NEXT: ld.u32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -957,10 +957,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -971,7 +971,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; 
SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -997,10 +997,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1011,7 +1011,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1042,10 +1042,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; +; SM30-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM30-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 
%r11, %r10, 3; @@ -1056,7 +1056,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; +; SM30-NEXT: ld.u32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1084,10 +1084,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -1098,7 +1098,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1125,10 +1125,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, 
[acq_rel_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1139,7 +1139,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1172,10 +1172,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; -; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; +; SM30-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM30-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; @@ -1186,7 +1186,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; +; SM30-NEXT: ld.u32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1214,10 +1214,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; +; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; ; SM70-NEXT: 
fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM70-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -1228,7 +1228,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; +; SM70-NEXT: ld.u32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1255,10 +1255,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1269,7 +1269,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1300,9 +1300,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; -; SM30-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; -; SM30-NEXT: ld.param.b32 %r2, 
[relaxed_sys_i32_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1313,9 +1313,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1325,9 +1325,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1342,9 +1342,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; -; SM30-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; -; SM30-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, 
[acq_rel_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1355,9 +1355,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1367,9 +1367,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1384,9 +1384,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; -; SM30-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; -; SM30-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, 
[%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1397,9 +1397,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1409,9 +1409,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1426,9 +1426,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; -; SM30-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; -; SM30-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM30-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1439,9 +1439,9 @@ define i32 
@release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; ; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1451,9 +1451,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; ; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1468,10 +1468,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; +; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; -; SM30-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; +; SM30-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM30-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1482,10 +1482,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; 
SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; +; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1495,10 +1495,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; @@ -1514,9 +1514,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; -; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; -; SM30-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1526,9 +1526,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1537,9 +1537,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1553,9 +1553,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; -; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; -; SM30-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1565,9 +1565,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_sys_i64_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1576,9 +1576,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1592,9 +1592,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; -; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; -; SM30-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1604,9 +1604,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; +; 
SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1615,9 +1615,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1631,9 +1631,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; -; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; -; SM30-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; +; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1643,9 +1643,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; +; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd2, 
[release_sys_i64_param_1]; +; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; ; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1654,9 +1654,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; ; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1670,10 +1670,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; +; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; -; SM30-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; +; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM30-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1683,10 +1683,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; +; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; +; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; 
SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1695,10 +1695,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index 263627fea8a50..04a9253df7b9e 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -46,9 +46,12 @@ for sm, ptx in TESTS: with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) - for size, success, failure, addrspace in product( - SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES + for size, success, failure, addrspace, llvm_scope in product( + SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES, LLVM_SCOPES ): + # cluster ordering is supported from SM90 onwards + if sm != 90 and llvm_scope == "cluster": + continue if addrspace == 0: addrspace_cast = "" else: @@ -61,6 +64,8 @@ size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast, + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], ), file=fp, ) From 66d415f3154ee011728ac5f8ccbf4732e187a497 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 30 May 2025 21:06:11 +0000 Subject: [PATCH 08/26] clang-format --- 
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 +++--- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2fe4adfdaacb4..5b8f4496529fa 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6292,9 +6292,9 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); auto *CI = cast(Inst); - auto CASWidth = cast(CI->getCompareOperand()->getType()) - ->getBitWidth(); - SyncScope::ID SSID= CI->getSyncScopeID(); + auto CASWidth = + cast(CI->getCompareOperand()->getType())->getBitWidth(); + SyncScope::ID SSID = CI->getSyncScopeID(); // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 6a944116edb88..2477e1fb61595 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -264,12 +264,10 @@ class NVPTXTargetLowering : public TargetLowering { AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override; - Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override; From c719cb23aa8d8b65f85682656b3a8d0f0884fb1d Mon Sep 17 
00:00:00 2001 From: Akshay Deodhar Date: Fri, 30 May 2025 21:22:13 +0000 Subject: [PATCH 09/26] black format --- llvm/test/CodeGen/NVPTX/cmpxchg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index 04a9253df7b9e..367fc885c0f8c 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -70,7 +70,7 @@ file=fp, ) - for llvm_scope in LLVM_SCOPES: + for llvm_scope in LLVM_SCOPES: # cluster ordering is supported from SM90 onwards if sm < 90 and llvm_scope == "cluster": continue From 9f167c0f4d9d1347e4dc6c88026d83a08803ed3a Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 30 May 2025 23:50:39 +0000 Subject: [PATCH 10/26] address review comments --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 5b8f4496529fa..eeeef994a7f38 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6275,10 +6275,11 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated SyncScope::ID SSID = cast(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) - return Ord == AtomicOrdering::SequentiallyConsistent - ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, - SSID) - : Builder.CreateFence(AtomicOrdering::Release, SSID); + return Builder.CreateFence( + Ord == AtomicOrdering::SequentiallyConsistent + ? 
AtomicOrdering::SequentiallyConsistent + : AtomicOrdering::Release, + SSID); return nullptr; } From d32fcb0efd9ca32509073bb0c39748e6b98eea97 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Mon, 2 Jun 2025 15:42:48 +0000 Subject: [PATCH 11/26] clang-format --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index eeeef994a7f38..0747240a4041c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6275,11 +6275,10 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated SyncScope::ID SSID = cast(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) - return Builder.CreateFence( - Ord == AtomicOrdering::SequentiallyConsistent - ? AtomicOrdering::SequentiallyConsistent - : AtomicOrdering::Release, - SSID); + return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent + ? 
AtomicOrdering::SequentiallyConsistent + : AtomicOrdering::Release, + SSID); return nullptr; } From 6b2e54c92f90beda505ce8b2d1394877ae13f60d Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Mon, 30 Jun 2025 23:43:58 +0000 Subject: [PATCH 12/26] Define new CMPXCHG instruction which takes sem, scope, and addressspace as constant operands, simplify codegen --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 15 ++-- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 2 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 35 +++++++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 3 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 74 +++++++++++++------ 5 files changed, 101 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 443db4391a523..0a56404e6862f 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -268,7 +268,7 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, llvm_unreachable("Empty Modifier"); } -void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, +void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); @@ -286,6 +286,12 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, case NVPTX::Ordering::Release: O << ".release"; return; + case NVPTX::Ordering::AcquireRelease: + O << ".acq_rel"; + return; + case NVPTX::Ordering::SequentiallyConsistent: + O << ".seq_cst"; + return; case NVPTX::Ordering::Volatile: O << ".volatile"; return; @@ -294,8 +300,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, return; default: report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" sem modifier. 
" - "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.", + "NVPTX AtomicCode Printer does not support \"{}\" sem modifier. ", OrderingToString(Ordering))); } } else if (Modifier == "scope") { @@ -317,7 +322,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, return; } report_fatal_error( - formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.", + formatv("NVPTX AtomicCode Printer does not support \"{}\" scope modifier.", ScopeToString(S))); } else if (Modifier == "addsp") { auto A = NVPTX::AddressSpace(Imm); @@ -334,7 +339,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, return; } report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.", + "NVPTX AtomicCode Printer does not support \"{}\" addsp modifier.", AddressSpaceToString(A))); } else if (Modifier == "sign") { switch (Imm) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 193c436939f66..9e879c78a6906 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -40,7 +40,7 @@ class NVPTXInstPrinter : public MCInstPrinter { StringRef Modifier = {}); void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); - void printLdStCode(const MCInst *MI, int OpNum, raw_ostream &O, + void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 429d52fb6f230..da83cdbfcfa79 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -304,6 +304,7 @@ void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) { } } + bool 
NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { unsigned IID = N->getConstantOperandVal(1); switch (IID) { @@ -513,6 +514,40 @@ static unsigned int getCodeAddrSpace(const MemSDNode *N) { .value_or(NVPTX::AddressSpace::Generic); } +unsigned int NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) const { + return convertAS(N->getMemOperand()->getAddrSpace()) + .value_or(NVPTX::AddressSpace::Generic); +} + +unsigned int NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { + // No "sem" orderings for SM/PTX versions which do not support memory ordering + if (!Subtarget->hasMemoryOrdering()) + return NVPTX::Ordering::NotAtomic; + auto Ordering = N->getMergedOrdering(); + switch (Ordering) { + case AtomicOrdering::NotAtomic: + case AtomicOrdering::Unordered: + return NVPTX::Ordering::NotAtomic; + case AtomicOrdering::Monotonic: + return NVPTX::Ordering::Relaxed; + case AtomicOrdering::Acquire: + return NVPTX::Ordering::Acquire; + case AtomicOrdering::Release: + return NVPTX::Ordering::Release; + case AtomicOrdering::AcquireRelease: + return NVPTX::Ordering::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return NVPTX::Ordering::SequentiallyConsistent; + } +} + +unsigned int NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { + // No "scope" modifier for SM/PTX versions which do not support scoped atomics + if (!Subtarget->hasAtomScope() || !Subtarget->hasMemoryOrdering()) + return NVPTX::Scope::Thread; + return Scopes[N->getSyncScopeID()]; +} + namespace { struct OperationOrderings { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b314c4ccefe8b..2228f292b3e2a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -102,6 +102,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } + unsigned int 
getAddrSpace(const MemSDNode *N) const; + unsigned int getMemOrder(const MemSDNode *N) const; + unsigned int getAtomicScope(const MemSDNode *N) const; bool SelectADDR(SDValue Addr, SDValue &Base, SDValue &Offset); SDValue getPTXCmpMode(const CondCodeSDNode &CondCode); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index cc0df4d3f0900..f51f0b2343ecf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1873,6 +1873,54 @@ multiclass F_ATOMIC_3 { + defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str # " \t$dst, [$addr], $b, $c;"; + + let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { + def _rr : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str, []>; + + def _ir : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str, []>; + + def _ri : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str, []>; + + def _ii : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str, []>; + } +} + +multiclass F_ATOMIC_3_MANYOPERAND_PATTERN { + defvar GetSem = SDNodeXForm(N)), SDLoc(N)); + }]>; + + defvar GetScope = SDNodeXForm(N)), SDLoc(N)); + }]>; + + defvar GetAddSp = SDNodeXForm(N)), SDLoc(N)); + }]>; + + def : Pat<(op:$this addr:$addr, t.Ty:$b, t.Ty:$c), + (!cast(InstructionName#_rr) ADDR:$addr, t.Ty:$b, t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c), + (!cast(InstructionName#_ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)), + (!cast(InstructionName#_#ri) 
ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)), + (!cast(InstructionName#_#ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; +} + multiclass F_ATOMIC_2_AS preds = []> { defvar frag_pat = (frag node:$a, node:$b); defm _G : F_ATOMIC_2, preds>; @@ -1934,29 +1982,11 @@ defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS("atomic_cmp_swap_i"#t.Size#_#order); - - // Instantiate scoped versions of the atomic compare and swap pattern - defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped; - - foreach scope = ["cta", "cluster", "gpu", "sys"] in { - defvar atomic_cmp_swap_pat_scoped = !cast("atomic_cmp_swap_i"#t.Size#_#order#_#scope); + defvar atomic_cmp_swap_pat = !cast("atomic_cmp_swap_i"#t.Size); + defm INT_PTX_ATOM_CAS_#t.Size + : F_ATOMIC_3_MANYOPERAND; - // Syncscope is only supported for SM70+ - defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope - : F_ATOMIC_3_AS, hasPTX<63>]>; - } - - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. - // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - defm INT_PTX_ATOM_CAS_#t.Size#_#order - : F_ATOMIC_3_AS, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old - : F_ATOMIC_3_AS; - } + defm INT_PTX_ATOM_CAS_PAT_#t.Size : F_ATOMIC_3_MANYOPERAND_PATTERN; } // Note that 16-bit CAS support in PTX is emulated. 
From 28024fcf36c689e12520739d91275194cc0789c1 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 1 Jul 2025 00:44:41 +0000 Subject: [PATCH 13/26] SM60 supports scope on atom.cas --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 2 +- llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll | 30 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 14312 ++++++++++++------ 3 files changed, 10015 insertions(+), 4329 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index da83cdbfcfa79..5cf78454894b0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -543,7 +543,7 @@ unsigned int NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { unsigned int NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { // No "scope" modifier for SM/PTX versions which do not support scoped atomics - if (!Subtarget->hasAtomScope() || !Subtarget->hasMemoryOrdering()) + if (!Subtarget->hasAtomScope()) return NVPTX::Scope::Thread; return Scopes[N->getSyncScopeID()]; } diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll index c7a0c60ae1f4d..94b3f0a2e1c3e 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll @@ -93,7 +93,8 @@ entry: %3 = atomicrmw or ptr %0, i8 %1 monotonic, align 1 ; ALL: atom.xor.b32 %4 = atomicrmw xor ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw xchg ptr %0, i8 %1 monotonic, align 1 ret void } @@ -101,13 +102,17 @@ entry: ; CHECK-LABEL: minmax_i8 define void @minmax_i8(ptr %0, i8 %1) { entry: - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %2 = atomicrmw min ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %3 = atomicrmw max ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + 
; SM60: atom.sys.cas.b32 %4 = atomicrmw umin ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw umax ptr %0, i8 %1 monotonic, align 1 ret void } @@ -121,7 +126,8 @@ entry: %3 = atomicrmw or ptr %0, i16 %1 monotonic, align 2 ; ALL: atom.xor.b32 %4 = atomicrmw xor ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw xchg ptr %0, i16 %1 monotonic, align 2 ret void } @@ -129,13 +135,17 @@ entry: ; CHECK-LABEL: minmax_i16 define void @minmax_i16(ptr %0, i16 %1) { entry: - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %2 = atomicrmw min ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %3 = atomicrmw max ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %4 = atomicrmw umin ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw umax ptr %0, i16 %1 monotonic, align 2 ret void } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 40e365c886c42..8d5800eccef9d 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 
%rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[monotonic_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 
%rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; 
SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -178,12 +178,12 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -203,9 +203,9 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -223,12 +223,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -248,9 +248,9 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -268,12 +268,12 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -313,12 +313,12 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -358,12 +358,12 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -383,9 +383,9 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -403,12 +403,12 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -416,8 +416,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -446,15 +446,14 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -462,8 +461,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -474,9 +473,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -492,15 +491,14 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -508,8 +506,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -520,9 +518,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -538,15 +536,14 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_sys( +define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -554,8 +551,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; 
SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -566,9 +563,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -587,12 +584,12 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_cta( +define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -600,8 +597,8 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -612,9 +609,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -630,15 +627,15 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_gpu( +define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -646,8 +643,8 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -658,9 +655,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -676,15 +673,15 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_sys( +define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -692,8 +689,8 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[monotonic_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -704,9 +701,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -722,15 +719,15 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_cta( +define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -738,8 +735,8 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -750,9 +747,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -768,15 +765,15 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_gpu( +define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -784,8 +781,8 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr 
addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -796,9 +793,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -814,15 +811,15 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg 
.b16 %rs<2>; @@ -830,9 +827,8 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -843,9 +839,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -861,15 +857,15 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
monotonic_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -877,9 +873,8 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -890,9 +885,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -908,15 +903,15 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i8 
@monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -924,9 +919,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -937,9 +931,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -955,15 +949,15 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM60-LABEL: monotonic_seq_cst_i8_global_sys( +define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -971,9 +965,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -984,9 +977,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1005,12 +998,12 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
monotonic_seq_cst_i8_global_cta( +define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1018,9 +1011,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1031,9 +1023,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1052,12 +1044,12 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
monotonic_seq_cst_i8_global_gpu( +define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1065,9 +1057,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1078,9 +1069,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1099,12 +1090,12 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
monotonic_seq_cst_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1112,8 +1103,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1125,9 +1116,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1146,12 +1137,12 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
monotonic_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1159,9 +1150,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1172,9 +1163,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1190,15 +1181,15 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
monotonic_seq_cst_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1206,9 +1197,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1219,9 +1210,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1237,15 +1228,15 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic 
seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_sys( +define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1253,8 +1244,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1265,9 +1257,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1283,15 +1275,15 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, 
i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1299,8 +1291,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1311,9 +1304,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1329,15 +1322,15 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") 
acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_gpu( +define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1345,8 +1338,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1357,9 +1351,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1375,15 +1369,15 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_sys( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1391,8 +1385,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1403,9 +1398,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1421,15 +1416,15 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB30_1; ; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; 
SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_cta( +define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1437,8 +1432,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1449,9 +1445,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1467,15 +1463,15 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 
% ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_gpu( +define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1483,8 +1479,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1495,9 +1492,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -1513,15 +1510,15 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB32_1; ; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_sys( +define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1529,8 +1526,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1541,9 +1539,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 
%r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1562,12 +1560,12 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_cta( +define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1575,8 +1573,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1587,9 +1586,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; 
SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1608,12 +1607,12 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_gpu( +define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1621,8 +1620,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1633,9 +1633,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: 
$L__BB35_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1654,12 +1654,12 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_sys( +define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1667,8 +1667,8 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1679,9 +1679,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1700,12 +1700,12 @@ 
define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_cta( +define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1713,8 +1713,8 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1725,9 +1725,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1743,15 +1743,15 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB37_1; ; SM60-NEXT: 
$L__BB37_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_gpu( +define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1759,8 +1759,8 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1771,9 +1771,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1789,15 +1789,15 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB38_1; ; 
SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_sys( +define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1805,8 +1805,8 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1817,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1835,15 +1835,15 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: 
mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB39_1; ; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_cta( +define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1851,8 +1851,8 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1863,9 +1863,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1881,15 +1881,15 @@ define i8 
@acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB40_1; ; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_gpu( +define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1897,8 +1897,8 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1909,9 +1909,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 @@ -1927,15 +1927,15 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB41_1; ; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_sys( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1943,8 +1943,8 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: 
and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1973,15 +1973,15 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB42_1; ; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_cta( +define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -1989,8 +1989,8 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2001,9 +2001,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2019,15 +2019,15 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_gpu( +define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2035,8 +2035,8 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2047,9 +2047,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, 
[acquire_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2065,15 +2065,15 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB44_1; ; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_sys( +define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2081,9 +2081,8 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2094,9 +2093,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, 
[acquire_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2115,12 +2114,12 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_cta( +define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2128,9 +2127,8 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2141,9 +2139,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, 
[acquire_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2162,12 +2160,12 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( +define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2175,9 +2173,8 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2188,9 +2185,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; 
-; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2209,12 +2206,12 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new } -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_sys( +define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2222,9 +2219,8 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2235,9 +2231,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: 
and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2256,12 +2252,12 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_cta( +define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2269,9 +2265,8 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2282,9 +2277,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 @@ -2300,15 +2295,15 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_gpu( +define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2316,9 +2311,8 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2329,9 +2323,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, 
%r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2347,15 +2341,15 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_sys( +define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2363,9 +2357,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2376,9 +2369,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, 
[%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2394,15 +2387,15 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_cta( +define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2410,9 +2403,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2423,9 +2415,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; ; SM60-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2441,15 +2433,15 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( +define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2457,9 +2449,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2470,9 +2461,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; 
SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2488,15 +2479,15 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_sys( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2504,9 +2495,8 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2517,9 +2507,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2535,14 +2525,15 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_cta( +define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2550,9 +2541,8 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2563,9 +2553,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 
%r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2581,14 +2571,15 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB55_1; ; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_gpu( +define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2596,9 +2587,8 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2609,9 +2599,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, 
%rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2627,14 +2617,15 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_sys( +define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2642,9 +2633,8 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2655,9 +2645,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 
%cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2673,14 +2663,15 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_cta( +define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2688,9 +2679,8 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2701,9 
+2691,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2719,14 +2709,15 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_gpu( +define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2734,9 +2725,8 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2747,9 +2737,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2765,14 +2755,15 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new } -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_sys( +define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2780,8 +2771,8 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2793,9 +2784,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2811,14 +2802,15 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_cta( +define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2826,9 +2818,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: 
ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2839,9 +2831,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2857,14 +2849,15 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_gpu( +define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2872,9 +2865,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; 
SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2885,9 +2878,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2903,14 +2896,15 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_sys( +define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2918,9 +2912,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[release_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2931,9 +2925,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2949,15 +2943,15 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_cta( +define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -2965,9 +2959,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[release_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -2978,9 +2972,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2996,15 +2990,15 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_gpu( +define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3012,9 +3006,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3025,9 +3019,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3043,15 +3037,15 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_sys( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3059,9 +3053,9 @@ define i8 
@release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3072,9 +3066,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3090,15 +3084,15 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_cta( +define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
acquire_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3106,9 +3100,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3119,9 +3113,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3137,15 +3131,15 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
release_acquire_i8_global_gpu( +define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3153,9 +3147,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3166,9 +3160,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3184,15 +3178,15 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new } -define i8 
@release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_sys( +define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3200,8 +3194,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3213,9 +3207,9 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3234,12 +3228,12 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; 
SM60-LABEL: release_acquire_i8_shared_cta( +define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3247,8 +3241,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3260,9 +3254,9 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3281,12 +3275,12 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_gpu( +define i8 
@acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3294,8 +3288,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3307,9 +3301,9 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3328,12 +3322,12 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new } -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_sys( +define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
release_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3341,8 +3335,8 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3354,9 +3348,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3372,15 +3366,14 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_cta( +define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg 
.pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3388,9 +3381,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3401,9 +3394,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3419,15 +3412,14 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_gpu( +define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
release_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3435,9 +3427,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3448,9 +3440,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3466,15 +3458,14 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_sys( +define i8 
@release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3482,9 +3473,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3495,9 +3486,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3513,15 +3504,14 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) 
%addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_cta( +define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3529,9 +3519,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3542,9 +3532,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3560,15 +3550,14 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release 
monotonic ret i8 %new } -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_gpu( +define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3576,9 +3565,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3589,9 +3578,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3607,15 +3596,14 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_sys( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3623,9 +3611,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3636,9 +3624,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3654,15 +3642,14 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_cta( +define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3670,9 +3657,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3683,9 +3670,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3701,15 +3688,14 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: 
@%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_gpu( +define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3717,9 +3703,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3730,9 +3716,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3748,15 +3734,14 @@ define i8 
@release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( +define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3764,8 +3749,8 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3777,9 +3762,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3795,15 
+3780,14 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( +define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3811,8 +3795,8 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3824,9 +3808,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-3842,15 +3826,14 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( +define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3858,8 +3841,8 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3871,9 +3854,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop 
Header: Depth=1 @@ -3889,15 +3872,14 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_sys( +define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3905,8 +3887,8 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3918,9 +3900,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner 
Loop Header: Depth=1 @@ -3939,12 +3921,12 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_cta( +define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3952,9 +3934,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -3965,9 +3947,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3983,15 +3965,15 @@ define 
i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( +define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -3999,9 +3981,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4012,9 +3994,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB86_1: 
// %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4030,15 +4012,15 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( +define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4046,9 +4028,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4059,9 +4041,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 
%r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4077,15 +4059,15 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( +define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4093,9 +4075,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4106,9 +4088,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, 
[release_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4124,15 +4106,15 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB88_1; ; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( +define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4140,9 +4122,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4153,9 +4135,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4171,15 +4153,15 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB89_1; ; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_sys( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4187,9 +4169,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4200,33 +4182,33 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 
%new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB90_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB90_1; ; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_cta( +define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4234,9 +4216,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: 
ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4247,33 +4229,33 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB91_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB91_1; ; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( 
+define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4281,9 +4263,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4294,33 +4276,33 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB92_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB92_1; ; SM60-NEXT: $L__BB92_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_sys( +define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4328,8 +4310,8 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4341,33 +4323,33 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: 
atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB93_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB93_1; ; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_cta( +define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4375,8 +4357,8 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4388,33 +4370,33 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, 
[release_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB94_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB94_1; ; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_gpu( +define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4422,8 +4404,8 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; 
SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4435,33 +4417,33 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB95_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB95_1; ; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_sys( +define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4469,8 +4451,8 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr 
addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4482,33 +4464,33 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB96_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB96_1; ; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 
@acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_cta( +define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4516,9 +4498,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4529,33 +4511,33 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB97_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; 
-; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB97_1; ; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( +define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4563,9 +4545,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4576,33 +4558,33 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB98_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB98_1; ; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( +define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4610,9 +4592,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4623,33 +4605,33 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB99_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB99_1; ; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( +define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4657,9 +4639,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; 
+; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4670,33 +4652,33 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB100_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB100_1; ; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( +define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
release_seq_cst_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4704,9 +4686,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4717,33 +4699,33 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB101_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB101_1; ; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4751,9 +4733,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4764,33 +4746,33 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 
%r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB102_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB102_1; ; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4798,9 +4780,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4811,33 +4793,33 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, 
[acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB103_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB103_1; ; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( +define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4845,9 +4827,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 
%rs1, [release_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -4858,33 +4840,33 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB104_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB104_1; ; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( +define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; 
SM60-LABEL: release_seq_cst_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4892,8 +4874,8 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4905,33 +4887,33 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB105_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB105_1; ; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( +define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4939,8 +4921,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4952,33 +4934,33 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB106_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB106_1; ; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( +define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -4986,8 +4968,8 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4999,33 +4981,33 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, 
[%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB107_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB107_1; ; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5033,8 +5015,8 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5046,33 +5028,33 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr 
%addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB108_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB108_1; ; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( +define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5080,9 +5062,9 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5093,33 +5075,33 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB109_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB109_1; ; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( +define i8 
@acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5127,9 +5109,9 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5140,33 +5122,33 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB110_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB110_1; ; SM60-NEXT: $L__BB110_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_sys( +define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5174,9 +5156,9 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5187,33 +5169,33 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; 
SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB111_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB111_1; ; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_cta( +define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5221,9 +5203,9 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5234,33 +5216,33 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB112_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB112_1; ; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( +define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5268,9 +5250,9 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; 
SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5281,33 +5263,33 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB113_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB113_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB113_1; ; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( 
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5315,9 +5297,9 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5328,33 +5310,33 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB114_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: 
@%p2 bra $L__BB114_1; ; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5362,9 +5344,9 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5375,33 +5357,33 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB115_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB115_1; ; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( +define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5409,9 +5391,9 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5422,33 +5404,33 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, 
i8 % ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB116_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB116_1; ; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5456,8 +5438,8 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[seq_cst_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5469,33 +5451,33 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB117_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB117_1; ; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_cta( +define i8 
@acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5503,8 +5485,8 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5516,33 +5498,33 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB118_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB118_1; ; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; 
SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( +define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5550,8 +5532,8 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5563,33 +5545,33 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; 
SM60-NEXT: @%p1 bra $L__BB119_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB119_1; ; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new } -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_sys( +define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5597,8 +5579,8 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5610,33 +5592,33 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; 
; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB120_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB120_1; ; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_cta( +define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5644,9 +5626,9 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5657,33 +5639,33 @@ define i8 
@seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB121_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB121_1; ; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_gpu( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5691,9 +5673,9 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5704,33 +5686,33 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB122_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB122_1; ; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 
@seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_sys( +define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5738,9 +5720,9 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5751,33 +5733,33 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB123_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; 
-; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB123_1; ; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_cta( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5785,9 +5767,9 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5798,33 +5780,33 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB124_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB124_1; ; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( +define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5832,9 +5814,9 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5845,33 +5827,33 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne 
; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB125_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB125_1; ; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5879,9 +5861,9 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5892,33 +5874,33 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB126_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB126_1; ; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, 
i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( +define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5926,9 +5908,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5939,33 +5921,33 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB127_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; 
SM60-NEXT: @%p2 bra $L__BB127_1; ; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( +define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -5973,9 +5955,9 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -5986,33 +5968,33 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, 
%r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB128_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB128_1; ; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( +define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6020,8 +6002,8 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6033,33 +6015,33 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, 
[seq_cst_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB129_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB129_1; ; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6067,8 +6049,8 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6080,33 +6062,33 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB130_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB130_1; ; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( +define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; 
SM60-NEXT: .reg .b16 %rs<2>; @@ -6114,8 +6096,8 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6127,33 +6109,33 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB131_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB131_1; ; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst 
seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( +define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6161,8 +6143,8 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6174,33 +6156,33 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB132_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM60-NEXT: and.b32 %r8, 
%r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB132_1; ; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( +define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6208,9 +6190,9 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -6221,33 +6203,33 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB133_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB133_1; ; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new } -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6255,9 +6237,9 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -6268,121 +6250,4218 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; 
SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB134_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM60-NEXT: mov.u32 %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB134_1; ; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_sys( +define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB135_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: 
@%p2 bra $L__BB135_1; ; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_cta( +define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<21>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; -; SM60-NEXT: and.b32 
%r19, %r15, %r2; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB136_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB136_1; ; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new } -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_gpu( +define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; 
SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB137_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB137_1; +; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, 
%r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB138_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB138_1; +; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; 
SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB139_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB139_1; +; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, 
%r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB140_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB140_1; +; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 
%r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB141_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB141_1; +; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB142_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB142_1; +; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst + ret i8 %new +} + +define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: 
atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB143_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB143_1; +; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB144_3; +; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB144_1; +; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB145_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; 
SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB145_1; +; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB146_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB146_1; +; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; 
SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB147_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB147_1; +; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst 
monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB148_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB148_1; +; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
seq_cst_monotonic_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB149_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB149_1; +; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; 
SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB150_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB150_1; +; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; 
SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB151_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB151_1; +; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB152_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB152_1; +; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB153_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB153_1; +; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; 
SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB154_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB154_1; +; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; 
SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB155_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB155_1; +; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; 
+; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB156_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB156_1; +; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM60-NEXT: 
// =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB157_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB157_1; +; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, 
%r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB158_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB158_1; +; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB159_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in 
Loop: Header=BB159_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB159_1; +; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB160_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: 
@%p2 bra $L__BB160_1; +; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB161_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB161_1; +; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], 
%r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB162_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB162_1; +; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret 
i8 %new +} + +define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB163_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB163_1; +; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
seq_cst_acquire_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB164_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB164_1; +; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg 
.b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB165_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB165_1; +; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB166_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB166_1; +; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB167_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB167_1; +; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 
255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB168_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB168_1; +; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB169_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB169_1; +; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; 
SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB170_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB170_1; +; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 
%r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB171_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB171_1; +; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; 
SM60-NEXT: @%p1 bra $L__BB172_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB172_1; +; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB173_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 +; 
SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB173_1; +; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB174_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 
bra $L__BB174_1; +; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB175_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB175_1; +; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 
[func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB176_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB176_1; +; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst + ret i8 %new +} + 
+define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB177_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB177_1; +; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
seq_cst_seq_cst_i8_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB178_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB178_1; +; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst + ret i8 %new +} + +define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg 
.b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r9, %rd2; +; SM60-NEXT: and.b32 %r10, %r9, 3; +; SM60-NEXT: shl.b32 %r1, %r10, 3; +; SM60-NEXT: mov.b32 %r11, 255; +; SM60-NEXT: shl.b32 %r12, %r11, %r1; +; SM60-NEXT: not.b32 %r2, %r12; +; SM60-NEXT: cvt.u32.u16 %r13, %rs1; +; SM60-NEXT: and.b32 %r14, %r13, 255; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB179_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB179_1; +; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst + ret i8 %new +} + +define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, 
[monotonic_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB180_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB180_1; +; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, 
[monotonic_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB181_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB181_1; +; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 
%r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB182_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB182_1; +; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: 
and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB183_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB183_1; +; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; 
SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB184_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB184_1; +; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB185_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB185_1; +; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB186_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra 
$L__BB186_1; +; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB187_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB187_1; +; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) 
%addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB188_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB188_1; +; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
monotonic_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB189_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB189_1; +; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; 
SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB190_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB190_1; +; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB191_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB191_1; +; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: 
and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB192_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB192_1; +; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB193_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB193_1; +; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB194_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB194_1; +; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB195_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB195_1; +; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB196_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM60-NEXT: and.b32 %r8, 
%r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB196_1; +; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB197_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB197_1; +; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; 
SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB198_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB198_1; +; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB199_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB199_1; +; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
monotonic_acquire_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB200_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB200_1; +; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; 
SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB201_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB201_1; +; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB202_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB202_1; +; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB203_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB203_1; +; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 
%r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB204_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB204_1; +; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, 
[%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB205_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB205_1; +; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 
%r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB206_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB206_1; +; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 
bra $L__BB207_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB207_1; +; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB208_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB208_1; +; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB209_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB209_1; +; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end +; 
SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB210_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB210_1; +; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 
%cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB211_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB211_1; +; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, 
i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB212_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB212_1; +; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 
%r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB213_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB213_1; +; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, 
[monotonic_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB214_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB214_1; +; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new +} + +define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: 
membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB215_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB215_1; +; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, 
%r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB216_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB216_1; +; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB217_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB217_1; +; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This 
Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB218_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB218_1; +; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_generic_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, 
%r17; +; SM60-NEXT: @%p1 bra $L__BB219_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB219_1; +; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB220_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB220_1; +; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB221_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB221_1; +; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; 
SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB222_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB222_1; +; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + 
ret i16 %new +} + +define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_gpu( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB223_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB223_1; +; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared( +; SM60: { 
+; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB224_1; +; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_sys( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 
%rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB225_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB225_1; +; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + ret i16 %new +} + +define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6390,10 +10469,10 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6403,30 +10482,31 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB137_3; +; SM60-NEXT: @%p1 bra $L__BB226_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB137_1; -; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB226_1; +; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, 
i16 %new syncscope("block") acquire monotonic ret i16 %new } -define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_sys( +define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6434,10 +10514,10 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6447,30 +10527,31 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB138_3; +; SM60-NEXT: @%p1 bra 
$L__BB227_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB138_1; -; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB227_1; +; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic ret i16 %new } -define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_cta( +define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6478,10 +10559,10 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6491,30 +10572,31 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, 
; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB139_3; +; SM60-NEXT: @%p1 bra $L__BB228_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB139_1; -; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB228_1; +; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire ret i16 %new } -define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_gpu( +define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6522,10 +10604,10 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6535,30 +10617,31 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB140_3; +; SM60-NEXT: @%p1 bra $L__BB229_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB140_1; -; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB229_1; +; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_sys( +define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6566,10 +10649,10 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6579,30 +10662,31 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 
bra $L__BB141_3; +; SM60-NEXT: @%p1 bra $L__BB230_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB141_1; -; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB230_1; +; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_cta( +define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6610,10 +10694,10 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6623,30 +10707,31 @@ define i16 
@monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB142_3; +; SM60-NEXT: @%p1 bra $L__BB231_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB142_1; -; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB231_1; +; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( +define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6654,10 +10739,10 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: .reg .b64 
%rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6667,30 +10752,31 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB143_3; +; SM60-NEXT: @%p1 bra $L__BB232_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB143_1; -; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB232_1; +; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_sys( +define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6698,10 +10784,10 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6711,31 +10797,31 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB144_3; +; SM60-NEXT: @%p1 bra $L__BB233_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB144_1; -; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB233_1; +; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_cta( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6743,10 +10829,10 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6756,31 +10842,31 
@@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB145_3; +; SM60-NEXT: @%p1 bra $L__BB234_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB145_1; -; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB234_1; +; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_gpu( +define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6788,10 +10874,10 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 
%rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6801,31 +10887,31 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB146_3; +; SM60-NEXT: @%p1 bra $L__BB235_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB146_1; -; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB235_1; +; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_sys( +define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6833,10 +10919,10 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6846,31 +10932,31 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB147_3; +; SM60-NEXT: @%p1 bra $L__BB236_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB147_1; -; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB236_1; +; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_cta( +define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6878,10 +10964,10 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6891,31 
+10977,31 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB148_3; +; SM60-NEXT: @%p1 bra $L__BB237_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB148_1; -; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB237_1; +; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_gpu( +define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6923,10 +11009,10 @@ define i16 
@monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6936,31 +11022,31 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB149_3; +; SM60-NEXT: @%p1 bra $L__BB238_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB149_1; -; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB238_1; +; SM60-NEXT: 
$L__BB238_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_sys( +define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -6968,10 +11054,10 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -6981,31 +11067,31 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; 
; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB150_3; +; SM60-NEXT: @%p1 bra $L__BB239_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB150_1; -; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB239_1; +; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire ret i16 %new } -define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_cta( +define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7013,10 +11099,11 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7026,31 +11113,31 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB151_3; +; SM60-NEXT: @%p1 bra $L__BB240_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB151_1; -; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB240_1; +; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst ret i16 %new } -define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_gpu( +define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acquire_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7058,10 +11145,11 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7071,31 +11159,31 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB152_3; +; SM60-NEXT: @%p1 bra $L__BB241_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB152_1; -; 
SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB241_1; +; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( +define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7103,10 +11191,10 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7117,31 +11205,31 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop +; SM60-NEXT: 
$L__BB242_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB153_3; +; SM60-NEXT: @%p1 bra $L__BB242_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB153_1; -; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB242_1; +; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( +define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7149,10 +11237,10 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7163,31 +11251,31 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB154_3; +; SM60-NEXT: @%p1 bra $L__BB243_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB154_1; -; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB243_1; +; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( +define i16 
@acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7195,10 +11283,10 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7209,31 +11297,31 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB155_3; +; SM60-NEXT: @%p1 bra $L__BB244_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: 
mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB155_1; -; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB244_1; +; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_sys( +define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7241,10 +11329,10 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7255,31 +11343,31 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: 
$L__BB156_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB156_3; +; SM60-NEXT: @%p1 bra $L__BB245_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB156_1; -; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB245_1; +; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_cta( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7287,10 +11375,10 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 
%rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7301,31 +11389,31 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB157_3; +; SM60-NEXT: @%p1 bra $L__BB246_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB157_1; -; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB246_1; +; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 
%new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( +define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7333,10 +11421,10 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7347,31 +11435,31 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB158_3; +; SM60-NEXT: @%p1 bra $L__BB247_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM60-NEXT: 
and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB158_1; -; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB247_1; +; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( +define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7379,10 +11467,10 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7393,31 +11481,31 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, 
[%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB159_3; +; SM60-NEXT: @%p1 bra $L__BB248_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB159_1; -; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB248_1; +; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst ret i16 %new } -define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( +define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7425,10 +11513,10 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, 
[monotonic_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7439,31 +11527,31 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB160_3; +; SM60-NEXT: @%p1 bra $L__BB249_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB160_1; -; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB249_1; +; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst ret i16 
%new } -define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( +define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7471,10 +11559,10 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -7485,31 +11573,31 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB161_3; +; SM60-NEXT: @%p1 bra $L__BB250_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB161_1; -; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB250_1; +; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new } -define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_sys( +define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7517,10 +11605,11 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7530,31 +11619,31 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 
%cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB162_3; +; SM60-NEXT: @%p1 bra $L__BB251_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB162_1; -; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB251_1; +; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst ret i16 %new } -define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_cta( +define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7562,10 +11651,11 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7575,31 +11665,30 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB163_3; +; SM60-NEXT: @%p1 bra $L__BB252_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB163_1; -; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB252_1; +; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_gpu( +define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7607,10 +11696,11 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7620,31 +11710,30 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB164_3; 
+; SM60-NEXT: @%p1 bra $L__BB253_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB164_1; -; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB253_1; +; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_sys( +define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7652,10 +11741,11 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7665,31 +11755,30 @@ define i16 
@acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB165_3; +; SM60-NEXT: @%p1 bra $L__BB254_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB165_1; -; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB254_1; +; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_cta( +define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7697,10 +11786,11 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7710,31 +11800,30 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB166_3; +; SM60-NEXT: @%p1 bra $L__BB255_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB166_1; -; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB255_1; +; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; 
; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_gpu( +define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7742,10 +11831,11 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7755,31 +11845,30 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB167_3; +; SM60-NEXT: @%p1 bra $L__BB256_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB167_1; -; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB256_1; +; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_sys( +define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7787,10 +11876,11 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7800,31 +11890,30 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB168_3; +; SM60-NEXT: @%p1 bra $L__BB257_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB168_1; -; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB257_1; +; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_cta( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; 
SM60-NEXT: .reg .b16 %rs<2>; @@ -7832,10 +11921,11 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7845,31 +11935,30 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB169_3; +; SM60-NEXT: @%p1 bra $L__BB258_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB169_1; -; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM60-NEXT: 
membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB258_1; +; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_gpu( +define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7877,10 +11966,11 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7890,31 +11980,30 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB259_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB170_3; +; SM60-NEXT: @%p1 bra $L__BB259_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB170_1; -; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB259_1; +; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_sys( +define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7922,10 +12011,11 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: 
ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7935,31 +12025,30 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB171_3; +; SM60-NEXT: @%p1 bra $L__BB260_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB171_1; -; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB260_1; +; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic ret i16 %new } -define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_cta( +define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, 
i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -7967,10 +12056,11 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -7980,31 +12070,30 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB172_3; +; SM60-NEXT: @%p1 bra $L__BB261_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: 
@%p2 bra $L__BB172_1; -; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB261_1; +; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic ret i16 %new } -define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_gpu( +define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8012,10 +12101,11 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -8025,31 +12115,30 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB173_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB173_3; +; SM60-NEXT: @%p1 bra $L__BB262_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB173_1; -; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB262_1; +; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new } -define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_sys( +define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8057,10 +12146,11 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -8070,31 +12160,30 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB174_3; +; SM60-NEXT: @%p1 bra $L__BB263_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB174_1; -; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB263_1; +; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic ret i16 %new } -define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) 
%addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_cta( +define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8102,10 +12191,11 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -8115,31 +12205,31 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB175_3; +; SM60-NEXT: @%p1 bra $L__BB264_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 ; 
SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB175_1; -; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB264_1; +; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire ret i16 %new } -define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_gpu( +define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8147,10 +12237,11 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -8160,31 +12251,31 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, 
[%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB176_3; +; SM60-NEXT: @%p1 bra $L__BB265_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB176_1; -; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB265_1; +; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_sys( +define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8192,10 +12283,11 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: 
ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -8205,31 +12297,31 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB177_3; +; SM60-NEXT: @%p1 bra $L__BB266_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB177_1; -; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB266_1; +; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") 
release acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_cta( +define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8237,10 +12329,11 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -8250,31 +12343,31 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB178_3; +; SM60-NEXT: @%p1 bra $L__BB267_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB178_1; -; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB267_1; +; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_gpu( +define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8282,10 +12375,11 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -8295,31 +12389,31 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB179_3; +; SM60-NEXT: @%p1 bra $L__BB268_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB179_1; -; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB268_1; +; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_sys( +define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8327,10 +12421,10 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 
%rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8341,31 +12435,31 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB180_3; +; SM60-NEXT: @%p1 bra $L__BB269_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB180_1; -; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB269_1; +; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = 
cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_cta( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8373,10 +12467,10 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8387,31 +12481,31 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB270_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB181_3; +; SM60-NEXT: @%p1 bra $L__BB270_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB270_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB181_1; -; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB270_1; +; SM60-NEXT: $L__BB270_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( +define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8419,10 +12513,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8433,31 +12527,31 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; 
; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB271_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB182_3; +; SM60-NEXT: @%p1 bra $L__BB271_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB182_1; -; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB271_1; +; SM60-NEXT: $L__BB271_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_sys( +define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8465,10 +12559,10 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[acquire_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8479,31 +12573,31 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB272_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB183_3; +; SM60-NEXT: @%p1 bra $L__BB272_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB272_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB183_1; -; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB272_1; +; SM60-NEXT: $L__BB272_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire 
ret i16 %new } -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_cta( +define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8511,10 +12605,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8525,31 +12619,31 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB273_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB184_3; +; SM60-NEXT: @%p1 bra $L__BB273_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure 
-; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB273_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB184_1; -; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB273_1; +; SM60-NEXT: $L__BB273_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire ret i16 %new } -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_gpu( +define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8557,10 +12651,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8571,31 +12665,31 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB274_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB185_3; +; SM60-NEXT: @%p1 bra $L__BB274_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB274_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB185_1; -; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB274_1; +; SM60-NEXT: $L__BB274_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_sys( +define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8603,10 +12697,10 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // 
%bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8617,31 +12711,31 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB275_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB186_3; +; SM60-NEXT: @%p1 bra $L__BB275_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB275_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB186_1; -; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB275_1; +; SM60-NEXT: $L__BB275_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_cta( +define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8649,10 +12743,10 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8663,31 +12757,31 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB276_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB187_3; +; SM60-NEXT: @%p1 bra $L__BB276_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB187_1; -; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB276_1; +; SM60-NEXT: $L__BB276_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst ret i16 %new } -define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( +define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8695,10 +12789,10 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; 
@@ -8709,31 +12803,31 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB277_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB188_3; +; SM60-NEXT: @%p1 bra $L__BB277_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB188_1; -; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB277_1; +; SM60-NEXT: $L__BB277_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_sys( +define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8741,10 +12835,10 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 
%new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8755,30 +12849,31 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB278_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB189_3; +; SM60-NEXT: @%p1 bra $L__BB278_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB189_1; -; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB278_1; +; SM60-NEXT: $L__BB278_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 
[func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_cta( +define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8786,10 +12881,10 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8800,30 +12895,31 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB279_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, 
%r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB190_3; +; SM60-NEXT: @%p1 bra $L__BB279_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB190_1; -; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB279_1; +; SM60-NEXT: $L__BB279_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_gpu( +define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8831,10 +12927,10 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8845,30 +12941,31 
@@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB280_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB191_3; +; SM60-NEXT: @%p1 bra $L__BB280_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB280_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB191_1; -; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB280_1; +; SM60-NEXT: $L__BB280_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_sys( +define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8876,10 +12973,10 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8890,30 +12987,31 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB281_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB192_3; +; SM60-NEXT: @%p1 bra $L__BB281_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB192_1; -; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB281_1; +; SM60-NEXT: $L__BB281_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold 
= cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_cta( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8921,10 +13019,10 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -8935,30 +13033,31 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB282_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB193_3; +; SM60-NEXT: @%p1 bra $L__BB282_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB193_1; -; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB282_1; +; SM60-NEXT: $L__BB282_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_gpu( +define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -8966,10 +13065,10 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; 
@@ -8980,30 +13079,31 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB283_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB194_3; +; SM60-NEXT: @%p1 bra $L__BB283_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB194_1; -; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB283_1; +; SM60-NEXT: $L__BB283_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_sys( +define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9011,10 +13111,10 @@ define i16 @release_monotonic_i16_shared_sys(ptr 
addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9025,30 +13125,31 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB284_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB195_3; +; SM60-NEXT: @%p1 bra $L__BB284_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB284_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB195_1; -; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB284_1; +; SM60-NEXT: $L__BB284_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_cta( +define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9056,10 +13157,10 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9070,30 +13171,31 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB285_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], 
%r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB196_3; +; SM60-NEXT: @%p1 bra $L__BB285_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB285_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB196_1; -; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB285_1; +; SM60-NEXT: $L__BB285_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst ret i16 %new } -define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_gpu( +define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9101,10 +13203,10 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; ; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9115,30 +13217,31 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB286_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB197_3; +; SM60-NEXT: @%p1 bra $L__BB286_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB197_1; -; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB286_1; +; SM60-NEXT: $L__BB286_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new } -define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_sys( +define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; 
SM60-NEXT: .reg .b16 %rs<2>; @@ -9146,10 +13249,10 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9160,31 +13263,31 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB287_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB198_3; +; SM60-NEXT: @%p1 bra $L__BB287_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB198_1; -; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, 
%r8; +; SM60-NEXT: @%p2 bra $L__BB287_1; +; SM60-NEXT: $L__BB287_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst ret i16 %new } -define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_cta( +define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9192,10 +13295,10 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9206,31 +13309,31 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB288_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; 
SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB199_3; +; SM60-NEXT: @%p1 bra $L__BB288_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB288_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB199_1; -; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB288_1; +; SM60-NEXT: $L__BB288_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic ret i16 %new } -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_gpu( +define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9238,10 +13341,10 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, 
[acq_rel_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9252,31 +13355,31 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB289_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB200_3; +; SM60-NEXT: @%p1 bra $L__BB289_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB289_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB200_1; -; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB289_1; +; SM60-NEXT: $L__BB289_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_sys( +define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred 
%p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9284,10 +13387,10 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9298,31 +13401,31 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB290_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB201_3; +; SM60-NEXT: @%p1 bra $L__BB290_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB201_1; -; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; 
SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB290_1; +; SM60-NEXT: $L__BB290_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_cta( +define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9330,10 +13433,10 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9344,31 +13447,31 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB291_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This 
Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB202_3; +; SM60-NEXT: @%p1 bra $L__BB291_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB202_1; -; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB291_1; +; SM60-NEXT: $L__BB291_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_gpu( +define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9376,10 +13479,10 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9390,31 +13493,31 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB292_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB203_3; +; SM60-NEXT: @%p1 bra $L__BB292_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB292_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB203_1; -; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB292_1; +; SM60-NEXT: $L__BB292_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic ret i16 %new } -define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_sys( +define i16 
@acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9422,10 +13525,10 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9436,31 +13539,31 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB293_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB204_3; +; SM60-NEXT: @%p1 bra $L__BB293_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB204_1; -; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB293_1; +; SM60-NEXT: $L__BB293_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_cta( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9468,10 +13571,10 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9482,31 +13585,31 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: 
$L__BB205_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB294_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB205_3; +; SM60-NEXT: @%p1 bra $L__BB294_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB294_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB205_1; -; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB294_1; +; SM60-NEXT: $L__BB294_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_gpu( +define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9514,10 +13617,10 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: 
ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9528,31 +13631,31 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB295_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB206_3; +; SM60-NEXT: @%p1 bra $L__BB295_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB295_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB206_1; -; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB295_1; +; SM60-NEXT: $L__BB295_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 
%new) { -; SM60-LABEL: release_seq_cst_i16_generic_sys( +define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9560,10 +13663,10 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9574,31 +13677,31 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB296_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB207_3; +; SM60-NEXT: @%p1 bra $L__BB296_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: 
setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB207_1; -; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB296_1; +; SM60-NEXT: $L__BB296_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_cta( +define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9606,10 +13709,10 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9620,31 +13723,31 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; 
SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB297_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB208_3; +; SM60-NEXT: @%p1 bra $L__BB297_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB208_1; -; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB297_1; +; SM60-NEXT: $L__BB297_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic ret i16 %new } -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_gpu( +define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9652,10 +13755,10 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; 
+; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9666,31 +13769,31 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB298_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB209_3; +; SM60-NEXT: @%p1 bra $L__BB298_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB209_1; -; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB298_1; +; SM60-NEXT: $L__BB298_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_sys(ptr 
addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_sys( +define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9698,10 +13801,10 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9712,31 +13815,31 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB299_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB210_3; +; SM60-NEXT: @%p1 bra $L__BB299_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 +; 
SM60-NEXT: // in Loop: Header=BB299_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB210_1; -; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB299_1; +; SM60-NEXT: $L__BB299_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic ret i16 %new } -define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_cta( +define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9744,10 +13847,10 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9758,31 +13861,31 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 
%r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB300_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB211_3; +; SM60-NEXT: @%p1 bra $L__BB300_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB300_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB211_1; -; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB300_1; +; SM60-NEXT: $L__BB300_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire ret i16 %new } -define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_gpu( +define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9790,10 +13893,10 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9804,31 +13907,31 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB301_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB212_3; +; SM60-NEXT: @%p1 bra $L__BB301_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB212_1; -; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB301_1; +; SM60-NEXT: $L__BB301_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg 
ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_sys( +define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9836,10 +13939,10 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9850,31 +13953,31 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB302_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB213_3; +; SM60-NEXT: @%p1 bra $L__BB302_3; ; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB213_1; -; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB302_1; +; SM60-NEXT: $L__BB302_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_cta( +define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9882,10 +13985,10 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9896,31 +13999,31 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB303_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB214_3; +; SM60-NEXT: @%p1 bra $L__BB303_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB214_1; -; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB303_1; +; SM60-NEXT: $L__BB303_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_gpu( +define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9928,10 +14031,10 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9942,31 +14045,31 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB304_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB215_3; +; SM60-NEXT: @%p1 bra $L__BB304_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB304_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB215_1; -; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB304_1; +; SM60-NEXT: $L__BB304_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( +define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -9974,10 +14077,10 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -9988,31 +14091,31 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB305_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM60-NEXT: @%p1 bra $L__BB216_3; +; SM60-NEXT: @%p1 bra $L__BB305_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB216_1; -; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB305_1; +; SM60-NEXT: $L__BB305_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10020,10 +14123,10 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10034,31 +14137,31 @@ define i16 
@acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB306_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB217_3; +; SM60-NEXT: @%p1 bra $L__BB306_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB306_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB217_1; -; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB306_1; +; SM60-NEXT: $L__BB306_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( +define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10066,10 +14169,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10080,31 +14183,31 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB307_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB218_3; +; SM60-NEXT: @%p1 bra $L__BB307_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB218_1; -; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB307_1; +; SM60-NEXT: $L__BB307_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
%addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_sys( +define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10112,10 +14215,10 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10126,31 +14229,31 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB308_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, 
%r17; -; SM60-NEXT: @%p1 bra $L__BB219_3; +; SM60-NEXT: @%p1 bra $L__BB308_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB308_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB219_1; -; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB308_1; +; SM60-NEXT: $L__BB308_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_cta( +define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10158,10 +14261,10 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10172,31 +14275,31 @@ 
define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB309_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB220_3; +; SM60-NEXT: @%p1 bra $L__BB309_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB220_1; -; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB309_1; +; SM60-NEXT: $L__BB309_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_gpu( +define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10204,10 +14307,10 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr 
addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10218,31 +14321,31 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB310_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB221_3; +; SM60-NEXT: @%p1 bra $L__BB310_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB310_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB221_1; -; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB310_1; +; SM60-NEXT: $L__BB310_3: // 
%partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_sys( +define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10250,10 +14353,10 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10264,31 +14367,31 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB311_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, 
%r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB222_3; +; SM60-NEXT: @%p1 bra $L__BB311_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB222_1; -; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB311_1; +; SM60-NEXT: $L__BB311_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( +define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10296,10 +14399,10 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: 
membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10310,31 +14413,31 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB312_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB223_3; +; SM60-NEXT: @%p1 bra $L__BB312_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB312_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB223_1; -; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB312_1; +; SM60-NEXT: $L__BB312_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( +define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: 
acq_rel_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10342,10 +14445,10 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10356,31 +14459,31 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB313_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB224_3; +; SM60-NEXT: @%p1 bra $L__BB313_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB224_1; -; SM60-NEXT: 
$L__BB224_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB313_1; +; SM60-NEXT: $L__BB313_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_sys( +define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10388,10 +14491,10 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10402,31 +14505,31 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB314_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB225_3; +; SM60-NEXT: @%p1 bra $L__BB314_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB314_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB225_1; -; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB314_1; +; SM60-NEXT: $L__BB314_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_cta( +define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10434,10 +14537,10 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10448,31 +14551,31 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB315_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB226_3; +; SM60-NEXT: @%p1 bra $L__BB315_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB315_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB226_1; -; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB315_1; +; SM60-NEXT: $L__BB315_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( +define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, 
i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10480,10 +14583,10 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10494,31 +14597,31 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB316_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB227_3; +; SM60-NEXT: @%p1 bra $L__BB316_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB227_1; -; SM60-NEXT: 
$L__BB227_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB316_1; +; SM60-NEXT: $L__BB316_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_sys( +define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10526,10 +14629,10 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10540,31 +14643,31 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB317_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB228_3; +; SM60-NEXT: @%p1 bra $L__BB317_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB228_1; -; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB317_1; +; SM60-NEXT: $L__BB317_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_cta( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10572,10 +14675,10 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; 
SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10586,31 +14689,31 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB318_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB229_3; +; SM60-NEXT: @%p1 bra $L__BB318_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB229_1; -; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB318_1; +; SM60-NEXT: $L__BB318_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_gpu( +define i16 
@acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10618,10 +14721,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10632,31 +14735,31 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB319_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB230_3; +; SM60-NEXT: @%p1 bra $L__BB319_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB319_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: 
mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB230_1; -; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB319_1; +; SM60-NEXT: $L__BB319_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_sys( +define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10664,10 +14767,10 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10678,31 +14781,31 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB231_1: // 
%partword.cmpxchg.loop +; SM60-NEXT: $L__BB320_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB231_3; +; SM60-NEXT: @%p1 bra $L__BB320_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB320_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB231_1; -; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB320_1; +; SM60-NEXT: $L__BB320_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_cta( +define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10710,10 +14813,10 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, 
[acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10724,31 +14827,31 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB321_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB232_3; +; SM60-NEXT: @%p1 bra $L__BB321_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB232_1; -; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB321_1; +; SM60-NEXT: $L__BB321_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 
%cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( +define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10756,10 +14859,10 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10770,31 +14873,31 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB322_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB233_3; +; SM60-NEXT: @%p1 bra $L__BB322_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 +; SM60-NEXT: // in Loop: 
Header=BB322_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB233_1; -; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB322_1; +; SM60-NEXT: $L__BB322_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys( +define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10802,10 +14905,10 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10816,31 +14919,31 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB323_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB234_3; +; SM60-NEXT: @%p1 bra $L__BB323_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB234_1; -; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB323_1; +; SM60-NEXT: $L__BB323_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( +define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10848,10 +14951,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10862,31 +14965,31 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB324_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB235_3; +; SM60-NEXT: @%p1 bra $L__BB324_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB324_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB235_1; -; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB324_1; +; SM60-NEXT: $L__BB324_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
seq_cst monotonic ret i16 %new } -define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( +define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10894,10 +14997,10 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10908,31 +15011,31 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB325_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB236_3; +; SM60-NEXT: @%p1 bra $L__BB325_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in 
Loop: Header=BB236_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB325_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB236_1; -; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB325_1; +; SM60-NEXT: $L__BB325_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( +define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10940,10 +15043,10 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -10954,31 +15057,31 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 
%r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB326_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB237_3; +; SM60-NEXT: @%p1 bra $L__BB326_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB326_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB237_1; -; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB326_1; +; SM60-NEXT: $L__BB326_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( +define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -10986,10 +15089,10 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; 
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11000,31 +15103,31 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB327_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB238_3; +; SM60-NEXT: @%p1 bra $L__BB327_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB238_1; -; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB327_1; +; SM60-NEXT: $L__BB327_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( +define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11032,10 +15135,10 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11046,31 +15149,31 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB328_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; 
SM60-NEXT: @%p1 bra $L__BB239_3; +; SM60-NEXT: @%p1 bra $L__BB328_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB328_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB239_1; -; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB328_1; +; SM60-NEXT: $L__BB328_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( +define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11078,10 +15181,10 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11092,31 +15195,31 @@ define 
i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB329_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB240_3; +; SM60-NEXT: @%p1 bra $L__BB329_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB240_1; -; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB329_1; +; SM60-NEXT: $L__BB329_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11124,10 +15227,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; 
SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11138,31 +15241,31 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB330_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB241_3; +; SM60-NEXT: @%p1 bra $L__BB330_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB330_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB241_1; -; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB330_1; +; SM60-NEXT: $L__BB330_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], 
%r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( +define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11170,10 +15273,10 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11184,31 +15287,31 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB331_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: 
atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB242_3; +; SM60-NEXT: @%p1 bra $L__BB331_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB242_1; -; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB331_1; +; SM60-NEXT: $L__BB331_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( +define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11216,10 +15319,10 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; 
SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11230,31 +15333,31 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB332_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB243_3; +; SM60-NEXT: @%p1 bra $L__BB332_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB332_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB243_1; -; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB332_1; +; SM60-NEXT: $L__BB332_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( +define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11262,10 +15365,10 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, 
i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11276,31 +15379,31 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB333_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB244_3; +; SM60-NEXT: @%p1 bra $L__BB333_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB333_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB244_1; -; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB333_1; +; SM60-NEXT: $L__BB333_3: // %partword.cmpxchg.end +; 
SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( +define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11308,10 +15411,10 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11322,31 +15425,31 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB334_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; 
SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB245_3; +; SM60-NEXT: @%p1 bra $L__BB334_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB245_1; -; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB334_1; +; SM60-NEXT: $L__BB334_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_sys( +define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11354,10 +15457,10 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: 
ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11368,31 +15471,31 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB335_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB246_3; +; SM60-NEXT: @%p1 bra $L__BB335_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB335_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB246_1; -; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB335_1; +; SM60-NEXT: $L__BB335_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_cta( +define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; 
SM60-LABEL: seq_cst_acquire_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11400,10 +15503,10 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11414,31 +15517,31 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB336_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB247_3; +; SM60-NEXT: @%p1 bra $L__BB336_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB247_1; -; SM60-NEXT: 
$L__BB247_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB336_1; +; SM60-NEXT: $L__BB336_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire ret i16 %new } -define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( +define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11446,10 +15549,10 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11460,31 +15563,31 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop +; SM60-NEXT: 
$L__BB337_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB248_3; +; SM60-NEXT: @%p1 bra $L__BB337_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB337_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB248_1; -; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB337_1; +; SM60-NEXT: $L__BB337_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared_sys( +define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11492,10 +15595,10 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11506,31 +15609,31 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB338_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB249_3; +; SM60-NEXT: @%p1 bra $L__BB338_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB249_1; -; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB338_1; +; SM60-NEXT: $L__BB338_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; 
SM60-LABEL: seq_cst_monotonic_i16_shared_cta( +define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11538,10 +15641,10 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11552,31 +15655,31 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB339_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB250_3; +; SM60-NEXT: @%p1 bra $L__BB339_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM60-NEXT: and.b32 
%r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB250_1; -; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB339_1; +; SM60-NEXT: $L__BB339_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire ret i16 %new } -define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( +define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11584,10 +15687,10 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11598,31 +15701,31 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; 
SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB340_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB251_3; +; SM60-NEXT: @%p1 bra $L__BB340_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB340_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB251_1; -; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB340_1; +; SM60-NEXT: $L__BB340_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_sys( +define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11630,10 +15733,10 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM60-NEXT: 
ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11644,31 +15747,31 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB341_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB252_3; +; SM60-NEXT: @%p1 bra $L__BB341_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB252_1; -; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB341_1; +; SM60-NEXT: $L__BB341_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire ret i16 %new } -define i16 
@seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_cta( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11676,10 +15779,10 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11690,31 +15793,31 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB342_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB253_3; +; SM60-NEXT: @%p1 bra $L__BB342_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 +; SM60-NEXT: // in Loop: 
Header=BB342_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB253_1; -; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB342_1; +; SM60-NEXT: $L__BB342_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( +define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11722,10 +15825,10 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11736,31 +15839,31 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 
%r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB343_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB254_3; +; SM60-NEXT: @%p1 bra $L__BB343_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB254_1; -; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB343_1; +; SM60-NEXT: $L__BB343_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global_sys( +define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11768,10 +15871,10 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11782,31 +15885,31 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB344_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB255_3; +; SM60-NEXT: @%p1 bra $L__BB344_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB344_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB255_1; -; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB344_1; +; SM60-NEXT: $L__BB344_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_global_cta(ptr 
addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global_cta( +define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11814,10 +15917,10 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11828,31 +15931,31 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB345_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB256_3; +; SM60-NEXT: @%p1 bra $L__BB345_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 +; 
SM60-NEXT: // in Loop: Header=BB345_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB256_1; -; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB345_1; +; SM60-NEXT: $L__BB345_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global_gpu( +define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11860,10 +15963,10 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11874,31 +15977,31 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, 
%r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB346_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB257_3; +; SM60-NEXT: @%p1 bra $L__BB346_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB257_1; -; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB346_1; +; SM60-NEXT: $L__BB346_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared_sys( +define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_shared_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11906,10 +16009,10 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, 
[seq_cst_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11920,31 +16023,31 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB347_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB258_3; +; SM60-NEXT: @%p1 bra $L__BB347_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB258_1; -; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB347_1; +; SM60-NEXT: $L__BB347_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire ret i16 %new } -define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared_cta( +define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11952,10 +16055,10 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -11966,31 +16069,31 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB348_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, 
%r17; -; SM60-NEXT: @%p1 bra $L__BB259_3; +; SM60-NEXT: @%p1 bra $L__BB348_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB348_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB259_1; -; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB348_1; +; SM60-NEXT: $L__BB348_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } -define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared_gpu( +define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11998,10 +16101,10 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12012,31 +16115,31 @@ 
define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB349_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB260_3; +; SM60-NEXT: @%p1 bra $L__BB349_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB349_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB260_1; -; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB349_1; +; SM60-NEXT: $L__BB349_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst ret i16 %new } -define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( +define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12044,10 +16147,10 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 
%rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12058,31 +16161,31 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB350_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB261_3; +; SM60-NEXT: @%p1 bra $L__BB350_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB261_1; -; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB350_1; +; SM60-NEXT: $L__BB350_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], 
%r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst ret i16 %new } -define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta( +define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12090,10 +16193,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12104,31 +16207,31 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB351_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 
%p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB262_3; +; SM60-NEXT: @%p1 bra $L__BB351_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB262_1; -; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB351_1; +; SM60-NEXT: $L__BB351_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst ret i16 %new } -define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu( +define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -12136,10 +16239,10 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12150,26 +16253,26 @@ define i16 
@seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB352_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB263_3; +; SM60-NEXT: @%p1 bra $L__BB352_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB352_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB263_1; -; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB352_1; +; SM60-NEXT: $L__BB352_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst ret i16 %new } @@ -12182,10 +16285,10 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; ; 
SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12196,22 +16299,22 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB353_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB264_3; +; SM60-NEXT: @%p1 bra $L__BB353_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB264_1; -; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB353_1; +; SM60-NEXT: $L__BB353_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12228,10 +16331,10 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 
%rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12242,22 +16345,22 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB354_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB265_3; +; SM60-NEXT: @%p1 bra $L__BB354_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB265_1; -; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB354_1; +; SM60-NEXT: $L__BB354_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12274,10 +16377,10 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12288,22 +16391,22 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB355_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB266_3; +; SM60-NEXT: @%p1 bra $L__BB355_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB355_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB266_1; -; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB355_1; +; SM60-NEXT: $L__BB355_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12311,6 +16414,52 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) 
%addr, i16 %cmp, i16 ret i16 %new } +define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_shared( +; SM60: { +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 65535; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: $L__BB356_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB356_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB356_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB356_1; +; SM60-NEXT: $L__BB356_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst + ret i16 %new +} + define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { ; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys( ; SM60: { @@ 
-12320,10 +16469,10 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12334,22 +16483,22 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB357_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB267_3; +; SM60-NEXT: @%p1 bra $L__BB357_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB357_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB267_1; -; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB357_1; +; SM60-NEXT: 
$L__BB357_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12366,10 +16515,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12380,22 +16529,22 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB358_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB268_3; +; SM60-NEXT: @%p1 bra $L__BB358_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB268_1; -; 
SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB358_1; +; SM60-NEXT: $L__BB358_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12412,10 +16561,10 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -12426,22 +16575,22 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop +; SM60-NEXT: $L__BB359_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; ; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB269_3; +; SM60-NEXT: @%p1 bra $L__BB359_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 +; SM60-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM60-NEXT: and.b32 %r8, 
%r7, %r2; ; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM60-NEXT: mov.u32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB269_1; -; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end +; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: @%p2 bra $L__BB359_1; +; SM60-NEXT: $L__BB359_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; @@ -12449,6 +16598,23 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ret i16 %new } +define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_generic_sys( ; SM60: { @@ -12456,10 +16622,10 @@ define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[monotonic_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic @@ -12473,10 +16639,10 @@ define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic @@ -12490,16 +16656,33 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr 
%addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic ret i32 %new } +define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_global_sys( ; SM60: { @@ -12507,10 +16690,10 @@ define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic @@ -12524,10 +16707,10 @@ define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 
%rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic @@ -12541,16 +16724,33 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic ret i32 %new } +define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; 
SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic + ret i32 %new +} + define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_monotonic_i32_shared_sys( ; SM60: { @@ -12558,10 +16758,10 @@ define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic @@ -12575,10 +16775,10 @@ define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; 
SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic @@ -12592,16 +16792,33 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic ret i32 %new } +define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = 
cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_generic_sys( ; SM60: { @@ -12609,10 +16826,10 @@ define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire @@ -12626,10 +16843,10 @@ define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire @@ 
-12643,16 +16860,33 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire ret i32 %new } +define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_global_sys( ; SM60: { @@ -12660,10 +16894,10 @@ define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; 
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire @@ -12677,10 +16911,10 @@ define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire @@ -12694,16 +16928,33 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: 
atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire ret i32 %new } +define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire + ret i32 %new +} + define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_acquire_i32_shared_sys( ; SM60: { @@ -12711,10 +16962,10 @@ define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 
%r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire @@ -12728,10 +16979,10 @@ define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire @@ -12745,16 +16996,34 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; 
%pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire ret i32 %new } +define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_seq_cst_i32_generic_sys( ; SM60: { @@ -12762,11 +17031,11 @@ define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst @@ -12780,11 +17049,11 @@ define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst @@ -12798,17 +17067,35 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst ret i32 %new } +define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_seq_cst_i32_global_sys( ; SM60: { @@ -12816,11 +17103,11 @@ define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst @@ -12834,11 +17121,11 @@ define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, 
[monotonic_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst @@ -12852,17 +17139,35 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst ret i32 %new } +define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst + ret i32 %new +} + define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: monotonic_seq_cst_i32_shared_sys( ; SM60: { @@ -12870,11 +17175,11 @@ define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst @@ -12888,11 +17193,11 @@ define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, 
[%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst @@ -12906,17 +17211,34 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst ret i32 %new } +define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_generic_sys( ; SM60: { @@ -12924,10 +17246,10 @@ define i32 
@acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic @@ -12941,10 +17263,10 @@ define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic @@ -12958,16 +17280,33 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; 
SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic ret i32 %new } +define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_global_sys( ; SM60: { @@ -12975,10 +17314,10 @@ define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic @@ -12992,10 +17331,10 @@ define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic @@ -13009,16 +17348,33 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[acquire_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic ret i32 %new } +define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic + ret i32 %new +} + define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_monotonic_i32_shared_sys( ; SM60: { @@ -13026,10 +17382,10 @@ define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 
%new syncscope("") acquire monotonic @@ -13043,10 +17399,10 @@ define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic @@ -13060,16 +17416,33 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic ret i32 %new } +define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
acquire_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire + ret i32 %new +} + define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_generic_sys( ; SM60: { @@ -13077,10 +17450,10 @@ define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire @@ -13094,10 +17467,10 @@ define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, 
[%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire @@ -13111,16 +17484,33 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire ret i32 %new } +define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire 
+ ret i32 %new +} + define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_acquire_i32_global_sys( ; SM60: { @@ -13128,10 +17518,10 @@ define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire @@ -13145,10 +17535,10 @@ define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire @@ -13162,13 +17552,30 @@ define i32 
@acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + ret i32 %new +} + +define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire ret i32 %new } @@ -13179,10 +17586,10 @@ define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; 
SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire @@ -13196,10 +17603,10 @@ define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire @@ -13213,16 +17620,34 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; 
+; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire ret i32 %new } +define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_generic_sys( ; SM60: { @@ -13230,11 +17655,11 @@ define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; 
; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst @@ -13248,11 +17673,11 @@ define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst @@ -13266,17 +17691,35 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst ret i32 %new } +define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { 
+; SM60-LABEL: acquire_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_global_sys( ; SM60: { @@ -13284,11 +17727,11 @@ define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst @@ -13302,11 +17745,11 @@ define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst @@ -13320,17 +17763,35 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst ret i32 %new } +define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; 
SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst + ret i32 %new +} + define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acquire_seq_cst_i32_shared_sys( ; SM60: { @@ -13338,11 +17799,11 @@ define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst @@ -13356,11 +17817,11 @@ define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; 
SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst @@ -13374,17 +17835,34 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst ret i32 %new } +define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: 
release_monotonic_i32_generic_sys( ; SM60: { @@ -13392,10 +17870,10 @@ define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic @@ -13409,10 +17887,10 @@ define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic @@ -13426,16 +17904,33 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic ret i32 %new } +define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_monotonic_i32_global_sys( ; SM60: { @@ -13443,10 +17938,10 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: 
atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic @@ -13460,10 +17955,10 @@ define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic @@ -13477,16 +17972,33 @@ define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, 
[release_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic ret i32 %new } +define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic + ret i32 %new +} + define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_monotonic_i32_shared_sys( ; SM60: { @@ -13494,10 +18006,10 @@ define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], 
%r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic @@ -13511,10 +18023,10 @@ define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic @@ -13528,16 +18040,33 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic ret i32 %new } +define i32 
@release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_generic_sys( ; SM60: { @@ -13545,10 +18074,10 @@ define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire @@ -13562,10 +18091,10 @@ define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, 
[release_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire @@ -13579,16 +18108,33 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire ret i32 %new } +define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + 
%pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_global_sys( ; SM60: { @@ -13596,10 +18142,10 @@ define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire @@ -13613,10 +18159,10 @@ define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 
%new syncscope("block") release acquire @@ -13630,16 +18176,33 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire ret i32 %new } +define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire + ret i32 %new +} + define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_acquire_i32_shared_sys( ; SM60: { @@ -13647,10 +18210,10 @@ define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, 
[release_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire @@ -13664,10 +18227,10 @@ define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire @@ -13681,16 +18244,34 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; -; 
SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire ret i32 %new } +define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_generic_sys( ; SM60: { @@ -13698,11 +18279,11 @@ define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[release_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst @@ -13716,11 +18297,11 @@ define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst @@ -13734,17 +18315,35 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, 
i32 %new syncscope("device") release seq_cst ret i32 %new } +define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_global_sys( ; SM60: { @@ -13752,11 +18351,11 @@ define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst @@ -13770,11 +18369,11 @@ define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: 
// %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst @@ -13788,17 +18387,35 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst ret i32 %new } +define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst + ret i32 %new +} + define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: release_seq_cst_i32_shared_sys( ; SM60: { @@ -13806,11 +18423,11 @@ define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst @@ -13824,11 +18441,11 @@ define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; -; 
SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst @@ -13842,17 +18459,34 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst ret i32 %new } +define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new 
acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_generic_sys( ; SM60: { @@ -13860,10 +18494,10 @@ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic @@ -13877,10 +18511,10 @@ define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic @@ -13894,16 +18528,33 @@ define i32 
@acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic ret i32 %new } +define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_global_sys( ; SM60: { @@ -13911,10 +18562,10 @@ define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, 
[acq_rel_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic @@ -13928,10 +18579,10 @@ define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic @@ -13945,16 +18596,33 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic ret i32 %new } +define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic + ret i32 %new +} + define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_monotonic_i32_shared_sys( ; SM60: { @@ -13962,10 +18630,10 @@ define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic @@ -13979,10 +18647,10 @@ define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic @@ -13996,16 +18664,33 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; 
%pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic ret i32 %new } +define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_generic_sys( ; SM60: { @@ -14013,10 +18698,10 @@ define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire @@ -14030,10 +18715,10 @@ define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire @@ -14047,16 +18732,33 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire ret i32 %new } +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_global_sys( ; SM60: { @@ -14064,10 +18766,10 @@ define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire @@ -14081,10 +18783,10 @@ define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM60-NEXT: 
atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire @@ -14098,16 +18800,33 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire ret i32 %new } +define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new +} + define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_acquire_i32_shared_sys( ; SM60: { @@ -14115,10 +18834,10 @@ define i32 
@acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire @@ -14132,10 +18851,10 @@ define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire @@ -14149,16 +18868,34 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire ret i32 %new } +define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys( ; SM60: { @@ -14166,11 +18903,11 @@ define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, 
[acq_rel_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst @@ -14184,11 +18921,11 @@ define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst @@ -14202,17 +18939,35 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst ret i32 %new } +define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_global_sys( ; SM60: { @@ -14220,11 +18975,11 @@ define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 
%new syncscope("") acq_rel seq_cst @@ -14238,11 +18993,11 @@ define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst @@ -14256,17 +19011,35 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst ret i32 %new } +define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; 
SM60-LABEL: acq_rel_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst + ret i32 %new +} + define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys( ; SM60: { @@ -14274,11 +19047,11 @@ define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst @@ -14292,11 +19065,11 @@ define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst @@ -14310,17 +19083,35 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst ret i32 %new } +define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; 
SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_monotonic_i32_generic_sys( ; SM60: { @@ -14328,11 +19119,11 @@ define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic @@ -14346,11 +19137,11 @@ define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[seq_cst_monotonic_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic @@ -14364,17 +19155,35 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic ret i32 %new } +define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic + ret i32 %new +} + define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; 
SM60-LABEL: seq_cst_monotonic_i32_global_sys( ; SM60: { @@ -14382,11 +19191,11 @@ define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic @@ -14400,32 +19209,50 @@ define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new +} + +define i32 
@seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_gpu( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic ret i32 %new } -define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global_gpu( +define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_shared( ; SM60: { ; SM60-NEXT: .reg .b32 %r<4>; ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst 
monotonic + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic ret i32 %new } @@ -14436,11 +19263,11 @@ define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic @@ -14454,11 +19281,11 @@ define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic @@ -14472,17 
+19299,35 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic ret i32 %new } +define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_acquire_i32_generic_sys( ; SM60: { @@ -14490,11 +19335,11 @@ define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, 
[seq_cst_acquire_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire @@ -14508,11 +19353,11 @@ define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire @@ -14526,17 +19371,35 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, 
[seq_cst_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire ret i32 %new } +define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_acquire_i32_global_sys( ; SM60: { @@ -14544,11 +19407,11 @@ define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, 
[seq_cst_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire @@ -14562,11 +19425,11 @@ define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire @@ -14580,17 +19443,35 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 
%r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire ret i32 %new } +define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire + ret i32 %new +} + define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_acquire_i32_shared_sys( ; SM60: { @@ -14598,11 +19479,11 @@ define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire @@ -14616,11 +19497,11 @@ 
define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire @@ -14634,17 +19515,35 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire ret i32 %new } +define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_generic( +; SM60: { +; SM60-NEXT: .reg .b32 
%r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys( ; SM60: { @@ -14652,11 +19551,11 @@ define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst @@ -14670,11 +19569,11 @@ define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: 
ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst @@ -14688,17 +19587,35 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst ret i32 %new } +define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = 
cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_global_sys( ; SM60: { @@ -14706,11 +19623,11 @@ define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst @@ -14724,11 +19641,11 @@ define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = 
cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst @@ -14742,17 +19659,35 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst ret i32 %new } +define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_shared( +; SM60: { +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { ; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys( ; SM60: { @@ -14760,11 +19695,11 @@ define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: 
.reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst @@ -14778,11 +19713,11 @@ define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst @@ -14796,27 +19731,43 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM60-NEXT: 
ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst ret i32 %new } +define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; 
SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic @@ -14829,10 +19780,10 @@ define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic @@ -14845,26 +19796,42 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 
[func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic ret i64 %new } +define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic @@ -14877,10 +19844,10 @@ define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic @@ -14893,26 +19860,42 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic ret i64 %new } +define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic + ret i64 %new +} + define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic @@ -14925,10 +19908,10 @@ define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic @@ -14941,26 +19924,42 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic ret i64 %new } +define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 
@monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire @@ -14973,10 +19972,10 @@ define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire @@ -14989,26 +19988,42 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire ret i64 %new } +define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire @@ -15021,10 +20036,10 @@ define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire @@ -15037,26 +20052,42 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; 
+; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire ret i64 %new } +define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire + ret i64 %new +} + define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire @@ -15069,10 +20100,10 @@ define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: 
.reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire @@ -15085,27 +20116,44 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire ret i64 %new } +define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst @@ -15118,11 +20166,11 @@ define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst @@ -15135,28 +20183,45 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst ret i64 %new } +define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 
@monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst @@ -15169,11 +20234,11 @@ define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst @@ -15186,28 +20251,45 @@ define i64 
@monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst ret i64 %new } +define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst + ret i64 %new +} + define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: monotonic_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; ; 
SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst @@ -15220,11 +20302,11 @@ define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst @@ -15237,27 +20319,43 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; 
SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst ret i64 %new } +define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], 
%rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic @@ -15270,10 +20368,10 @@ define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic @@ -15286,26 +20384,42 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic ret i64 %new } +define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 
%cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic @@ -15318,10 +20432,10 @@ define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: 
atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic @@ -15334,26 +20448,42 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic ret i64 %new } +define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; 
SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic + ret i64 %new +} + define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic @@ -15366,10 +20496,10 @@ define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire 
monotonic @@ -15382,26 +20512,42 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic ret i64 %new } +define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, 
[acquire_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire @@ -15414,10 +20560,10 @@ define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire @@ -15430,26 +20576,42 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire ret i64 %new } +define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire @@ -15462,10 +20624,10 @@ define i64 
@acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire @@ -15478,26 +20640,42 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire ret i64 %new } +define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; 
SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire + ret i64 %new +} + define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire @@ -15510,10 +20688,10 @@ define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire @@ -15526,27 +20704,44 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire ret i64 %new } +define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 
%new) { ; SM60-LABEL: acquire_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst @@ -15559,11 +20754,11 @@ define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst @@ -15576,28 +20771,45 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, 
[acquire_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst ret i64 %new } +define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst @@ -15610,11 +20822,11 @@ define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst @@ -15627,28 +20839,45 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[acquire_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst ret i64 %new } +define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst + ret i64 %new +} + define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acquire_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst @@ -15661,31 +20890,47 @@ define i64 
@acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + ret i64 %new +} + +define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_shared_gpu( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst ret i64 %new } -define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared_gpu( +define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_generic( ; 
SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic ret i64 %new } @@ -15695,10 +20940,10 @@ define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic @@ -15711,10 +20956,10 @@ define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic @@ -15727,26 +20972,42 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic ret i64 %new } +define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; 
SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic @@ -15759,10 +21020,10 @@ define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: 
atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic @@ -15775,26 +21036,42 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic ret i64 %new } +define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic + ret i64 %new +} + define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 
%rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic @@ -15807,10 +21084,10 @@ define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic @@ -15823,26 +21100,42 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 
%rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic ret i64 %new } +define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[release_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire @@ -15855,10 +21148,10 @@ define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire @@ -15871,26 +21164,42 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new 
syncscope("device") release acquire ret i64 %new } +define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire @@ -15903,10 +21212,10 @@ define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM60-NEXT: 
ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire @@ -15919,26 +21228,42 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire ret i64 %new } +define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + ret i64 %new +} + define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire @@ -15951,10 +21276,10 @@ define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new 
syncscope("block") release acquire @@ -15967,27 +21292,44 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire ret i64 %new } +define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; 
SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst @@ -16000,11 +21342,11 @@ define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst @@ -16017,28 +21359,45 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst ret i64 %new } +define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, 
i64 %cmp, i64 %new syncscope("") release seq_cst @@ -16051,11 +21410,11 @@ define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst @@ -16068,28 +21427,45 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst ret i64 %new } +define i64 @release_seq_cst_i64_shared(ptr addrspace(3) 
%addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + ret i64 %new +} + define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: release_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst @@ -16102,11 +21478,11 @@ define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst @@ -16119,27 +21495,43 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst ret i64 %new } +define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; 
SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic @@ -16152,10 +21544,10 @@ define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic @@ 
-16168,26 +21560,42 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic ret i64 %new } +define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, 
[acq_rel_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic @@ -16200,10 +21608,10 @@ define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic @@ -16216,26 +21624,42 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic ret i64 %new } +define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + ret i64 %new +} + define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg 
ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic @@ -16248,10 +21672,10 @@ define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic @@ -16264,26 +21688,42 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic ret i64 %new } +define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 
%cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire @@ -16296,10 +21736,10 @@ define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire @@ -16312,26 +21752,42 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } +define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 
@acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire @@ -16344,10 +21800,10 @@ define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire @@ -16360,26 +21816,42 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } +define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + ret i64 %new +} + define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire @@ -16392,10 +21864,10 @@ define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire @@ -16408,27 +21880,44 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[acq_rel_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire ret i64 %new } +define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst @@ -16441,11 +21930,11 @@ define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg 
.b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst @@ -16458,28 +21947,45 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } +define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; 
+; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst @@ -16492,11 +21998,11 @@ define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst @@ -16509,28 +22015,45 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } +define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + ret i64 %new +} + define i64 
@acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst @@ -16543,11 +22066,11 @@ define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst @@ -16560,28 +22083,45 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, 
i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst ret i64 %new } +define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: 
ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic @@ -16594,11 +22134,11 @@ define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic @@ -16611,28 +22151,45 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } +define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
syncscope("") seq_cst monotonic @@ -16645,11 +22202,11 @@ define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic @@ -16662,28 +22219,45 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } +define i64 @seq_cst_monotonic_i64_shared(ptr 
addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic + ret i64 %new +} + define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_monotonic_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic @@ -16696,11 +22270,11 @@ define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; 
SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic @@ -16713,28 +22287,45 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic ret i64 %new } +define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[seq_cst_acquire_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire @@ -16747,11 +22338,11 @@ define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; 
; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire @@ -16764,28 +22355,45 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } +define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; 
+; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire @@ -16798,11 +22406,11 @@ define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire @@ -16815,28 +22423,45 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 
%rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } +define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + ret i64 %new +} + define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_acquire_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, 
[seq_cst_acquire_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire @@ -16849,11 +22474,11 @@ define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire @@ -16866,28 +22491,45 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 
[func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire ret i64 %new } +define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_generic( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst @@ -16900,11 +22542,11 @@ define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; 
SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst @@ -16917,28 +22559,45 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } +define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 
%rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_global_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst @@ -16951,11 +22610,11 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst @@ -16968,28 +22627,45 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst ret i64 %new } +define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_shared( +; SM60: { +; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-EMPTY: +; SM60-NEXT: // %bb.0: +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} + define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { ; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst @@ -17002,11 +22678,11 @@ define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst @@ -17019,11 +22695,11 @@ define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM60-NEXT: 
ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst From bbd7015370dcde365c94975c752b70d3177b5c6f Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 1 Jul 2025 20:19:30 +0000 Subject: [PATCH 14/26] Replace old F_ATOMIC_3 completely with a single-opcode variation --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 143 +++++++++++------------ 1 file changed, 69 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f51f0b2343ecf..d0af44e392f12 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -41,6 +41,45 @@ def AS_match { }]; } + +//===----------------------------------------------------------------------===// +// NVPTX Scope Constants +// These map to the Scope enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def Scope_thread : PatLeaf<(i32 0)>; // Thread = 0 +def Scope_cta : PatLeaf<(i32 1)>; // Block = 1 +def Scope_cluster : PatLeaf<(i32 2)>; // Cluster = 2 +def Scope_device : PatLeaf<(i32 3)>; // Device = 3 +def Scope_sys : PatLeaf<(i32 4)>; // System = 4 + +//===----------------------------------------------------------------------===// +// NVPTX Address Space Constants +// These map to the AddressSpace enum in NVPTX.h 
+//===----------------------------------------------------------------------===// + +def AddrSpace_gen : PatLeaf<(i32 0)>; // Generic = 0 +def AddrSpace_global : PatLeaf<(i32 1)>; // Global = 1 +def AddrSpace_shared : PatLeaf<(i32 3)>; // Shared = 3 +def AddrSpace_const : PatLeaf<(i32 4)>; // Const = 4 +def AddrSpace_local : PatLeaf<(i32 5)>; // Local = 5 +def AddrSpace_shared_cluster : PatLeaf<(i32 7)>; // SharedCluster = 7 +def AddrSpace_param : PatLeaf<(i32 101)>; // Param = 101 + +//===----------------------------------------------------------------------===// +// NVPTX Ordering Constants +// These map to the Ordering enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def Ordering_not_atomic : PatLeaf<(i32 0)>; // NotAtomic = 0 +def Ordering_relaxed : PatLeaf<(i32 2)>; // Relaxed = 2 +def Ordering_acquire : PatLeaf<(i32 4)>; // Acquire = 4 +def Ordering_release : PatLeaf<(i32 5)>; // Release = 5 +def Ordering_acquire_release : PatLeaf<(i32 6)>; // AcquireRelease = 6 +def Ordering_sequentially_consistent : PatLeaf<(i32 7)>; // SequentiallyConsistent = 7 +def Ordering_volatile : PatLeaf<(i32 8)>; // Volatile = 8 +def Ordering_relaxed_mmio : PatLeaf<(i32 9)>; // RelaxedMMIO = 9 + multiclass nvvm_ternary_atomic_op_scoped { defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val); def NAME#_cta: PatFrag preds> { - defvar asm_str = "atom" # sem_str # as_str # "."
# op_str; - let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, - Requires; - - def ir : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, - Requires; - - def ri : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, - Requires; - - def ii : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, - Requires; - } -} - -multiclass F_ATOMIC_3_MANYOPERAND { - defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str # " \t$dst, [$addr], $b, $c;"; +multiclass F_ATOMIC_3 { + defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def _rr : NVPTXInst<(outs t.RC:$dst), + def _rr : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ir : NVPTXInst<(outs t.RC:$dst), + def _ir : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ri : NVPTXInst<(outs t.RC:$dst), + def _ri : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ii : NVPTXInst<(outs t.RC:$dst), + def _ii : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; } } -multiclass F_ATOMIC_3_MANYOPERAND_PATTERN { +multiclass F_ATOMIC_3_PATTERN { defvar GetSem = SDNodeXForm(N)), SDLoc(N)); }]>; @@ -1929,14 +1937,6 @@ 
multiclass F_ATOMIC_2_AS, preds>; } -multiclass F_ATOMIC_3_AS preds = []> { - defvar frag_pat = (frag node:$a, node:$b, node:$c); - defm _G : F_ATOMIC_3, preds>; - defm _S : F_ATOMIC_3, preds>; - defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; - defm _GEN : F_ATOMIC_3, preds>; -} - // atom_add defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS; defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS; @@ -1981,17 +1981,14 @@ defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS("atomic_cmp_swap_i"#t.Size); defm INT_PTX_ATOM_CAS_#t.Size - : F_ATOMIC_3_MANYOPERAND; + : F_ATOMIC_3; - defm INT_PTX_ATOM_CAS_PAT_#t.Size : F_ATOMIC_3_MANYOPERAND_PATTERN; + defm INT_PTX_ATOM_CAS_PAT_#t.Size : F_ATOMIC_3_PATTERN; } -// Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; - // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} // and converts it into the appropriate instruction. @@ -2015,20 +2012,6 @@ multiclass ATOM2N_impl; } -multiclass ATOM3N_impl Preds> { - defm "" : F_ATOMIC_3( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - preds = Preds>; -} // Constructs variants for different scopes of atomic op. multiclass ATOM2S_impl Preds> { - // No need to define ".gpu"-scoped atomics. They do the same thing - // as the regular, non-scoped atomics defined elsewhere. + +multiclass F_ATOMIC_3_INTRINSIC_PATTERN { foreach scope = ["cta", "sys"] in { - // For now we only need variants for generic space pointers. 
foreach space = ["gen"] in { +<<<<<<< HEAD defm _#scope#space : ATOM3N_impl; +======= + defvar intrinsic = !cast("int_nvvm_atomic_" # OpStr # "_" # space # "_" # IntTypeStr # "_" # scope); + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)), + (!cast(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c)), + (!cast(InstructionName # "_ir") ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ri") ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ii") ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; +>>>>>>> Replace old F_ATOMIC_3 completely with a single-opcode variation } } } @@ -2094,9 +2089,9 @@ multiclass ATOM2_incdec_impl { // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : ATOM3S_impl; - defm _b32 : ATOM3S_impl; - defm _b64 : ATOM3S_impl; + defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN; } defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; From 2a5d458e012521925a926ffdee7f95d97f76d070 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 1 Jul 2025 21:43:43 +0000 Subject: [PATCH 15/26] cleanup --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 +- llvm/lib/CodeGen/AtomicExpandPass.cpp | 4 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 10 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +- llvm/lib/Target/ARM/ARMISelLowering.h | 6 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 +- 
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 24 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 23 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelLowering.h | 11 - llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 6 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 4140 +++++++------- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 5580 +++++++++---------- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 280 +- 15 files changed, 5041 insertions(+), 5073 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index d11e2ca22b189..ad8299ffd41ec 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2327,13 +2327,11 @@ class LLVM_ABI TargetLoweringBase { /// @{ virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const; + AtomicOrdering Ord) const; virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const; + AtomicOrdering Ord) const; /// @} // Emits code that executes when the comparison result in the ll/sc diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index bc400b28d26af..0e24b3d65ee2e 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -453,9 +453,9 @@ bool AtomicExpandImpl::bracketInstWithFences(Instruction *I, SyncScope::ID SSID) { ReplacementIRBuilder Builder(I, *DL); - auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID); + auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); - auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); // We have a guard here because not every atomic operation generates a // trailing fence. 
if (TrailingFence) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 6c4a480b5ca87..0a077b7b61437 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2383,20 +2383,18 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI, Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) - return Builder.CreateFence(Ord, SSID); + return Builder.CreateFence(Ord); else return nullptr; } Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (isAcquireOrStronger(Ord)) - return Builder.CreateFence(Ord, SSID); + return Builder.CreateFence(Ord); else return nullptr; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1a409c3165f49..05ca11cfac5cb 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21190,8 +21190,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -21216,8 +21215,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git 
a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 79926386cde1e..8ee009c7b2e39 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -675,11 +675,9 @@ class VectorType; emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; Instruction *emitLeadingFence( - IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override; + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override; Instruction *emitTrailingFence( - IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override; + IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override; unsigned getMaxSupportedInterleaveFactor() const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 0747240a4041c..8d6fc11a4f04d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6266,8 +6266,7 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (!isa(Inst)) return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); @@ -6285,8 +6284,7 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { // Specialize for cmpxchg if (!isa(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index dcdebb81e3c86..3b3d699099b06 100644 
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1602,8 +1602,8 @@ def ADDR : Operand { let MIOperandInfo = (ops ADDR_base, i32imm); } -def LdStCode : Operand { - let PrintMethod = "printLdStCode"; +def AtomicCode : Operand { + let PrintMethod = "printAtomicCode"; } def MmaCode : Operand { @@ -1948,7 +1948,7 @@ defm ProxyRegB64 : ProxyRegInst<"b64", B64>; class LD : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; @@ -1964,7 +1964,7 @@ class ST : NVPTXInst< (outs), (ins O:$src, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$toWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth" " \t[$addr], $src;", []>; @@ -1982,21 +1982,21 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC { def _v2 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v4 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; if support_v8 then def _v8 : NVPTXInst< (outs regclass:$dst1, 
regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " @@ -2013,14 +2013,14 @@ multiclass ST_VEC { def _v2 : NVPTXInst< (outs), (ins O:$src1, O:$src2, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v4 : NVPTXInst< (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; @@ -2029,7 +2029,7 @@ multiclass ST_VEC { (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, O:$src5, O:$src6, O:$src7, O:$src8, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth " "\t[$addr], " diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d0af44e392f12..7faa13a8571d6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1882,22 +1882,22 @@ multiclass F_ATOMIC_2 { - defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str + defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str # "\t$dst, [$addr], $b, $c;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def 
_rr : BasicNVPTXInst<(outs t.RC:$dst), + def _rr : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ir : BasicNVPTXInst<(outs t.RC:$dst), + def _ir : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ri : BasicNVPTXInst<(outs t.RC:$dst), + def _ri : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ii : BasicNVPTXInst<(outs t.RC:$dst), + def _ii : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; } @@ -2030,10 +2030,6 @@ multiclass ATOM2S_impl { foreach scope = ["cta", "sys"] in { foreach space = ["gen"] in { -<<<<<<< HEAD - defm _#scope#space : ATOM3N_impl; -======= defvar intrinsic = !cast("int_nvvm_atomic_" # OpStr # "_" # space # "_" # IntTypeStr # "_" # scope); def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)), (!cast(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; @@ -2046,7 +2042,6 @@ multiclass F_ATOMIC_3_INTRINSIC_PATTERN(InstructionName # "_ii") ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; ->>>>>>> Replace old F_ATOMIC_3 completely with a single-opcode variation } } } @@ -2157,7 +2152,7 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", B32>; // during the lifetime of the kernel. 
class LDG_G - : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + : NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>; def LD_GLOBAL_NC_i8 : LDG_G; @@ -2170,19 +2165,19 @@ def LD_GLOBAL_NC_i64 : LDG_G; // Elementized vector ldg class VLDG_G_ELE_V2 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; class VLDG_G_ELE_V4 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; class VLDG_G_ELE_V8 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 7e2bd684a3e06..b96505816dee8 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12820,8 +12820,7 @@ Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder, // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -12831,8 +12830,7 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 8f1793ac1136f..4c88bd372b106 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -927,7 +927,6 @@ namespace llvm { return true; } -<<<<<<< HEAD Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override; @@ -938,16 +937,6 @@ namespace llvm { AtomicOrdering Ord) const override; Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override; -======= - Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; - Instruction * - emitTrailingFence(IRBuilderBase 
&Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; ->>>>>>> [NVPTX] Add syncscope support for cmpxchg bool shouldInlineQuadwordAtomics() const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a08b4aac24e06..35fbac04b3405 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23306,8 +23306,7 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint( Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); @@ -23323,8 +23322,7 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder, Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID) const { + AtomicOrdering Ord) const { if (Subtarget.hasStdExtZtso()) { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 05ea2e5759f80..08993c454e201 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -234,12 +234,10 @@ class RISCVTargetLowering : public TargetLowering { Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + AtomicOrdering Ord) const override; Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord, - SyncScope::ID SSID = SyncScope::System) const override; + AtomicOrdering Ord) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT 
VT) const override; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index bae9520da7905..ddedc7ea36252 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; 
SM70-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -203,9 +203,9 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, 
[monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -248,9 +248,9 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -371,8 +371,8 @@ define i8 
@monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -383,9 +383,9 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -416,8 +416,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, 
[monotonic_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -462,8 +462,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -474,9 +474,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -508,8 +508,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -520,9 +520,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -554,8 +554,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -566,9 +566,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -600,8 +600,8 @@ define i8 
@monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -612,9 +612,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -646,8 +646,8 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -658,9 +658,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, 
[monotonic_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -692,8 +692,8 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -704,9 +704,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -738,8 +738,8 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -750,9 +750,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -784,8 +784,8 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -796,9 +796,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-830,8 +830,8 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -843,9 +843,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -877,8 +877,8 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -890,9 +890,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, 
[monotonic_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -924,8 +924,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -937,9 +937,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -971,8 +971,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -984,9 +984,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1018,8 +1018,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1031,9 +1031,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1065,8 +1065,8 @@ define i8 
@monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1078,9 +1078,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1112,8 +1112,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1125,9 +1125,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, 
[monotonic_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1159,8 +1159,8 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1172,9 +1172,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1206,8 +1206,8 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 
%rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1219,9 +1219,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1253,8 +1253,8 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1265,9 +1265,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1299,8 +1299,8 @@ define i8 
@acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1311,9 +1311,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1345,8 +1345,8 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1357,9 +1357,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, 
[acquire_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1391,8 +1391,8 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1403,9 +1403,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1437,8 +1437,8 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 
%rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1449,9 +1449,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1483,8 +1483,8 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1495,9 +1495,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1529,8 
+1529,8 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1541,9 +1541,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1575,8 +1575,8 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1587,9 +1587,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1621,8 +1621,8 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1633,9 +1633,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1667,8 +1667,8 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; 
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1679,9 +1679,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1713,8 +1713,8 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1725,9 +1725,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1759,8 +1759,8 @@ define i8 
@acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1771,9 +1771,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1805,8 +1805,8 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1817,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; 
+; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1851,8 +1851,8 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1863,9 +1863,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1897,8 +1897,8 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1909,9 +1909,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1943,8 +1943,8 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1989,8 +1989,8 @@ define i8 @acquire_acquire_i8_shared_cta(ptr 
addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2001,9 +2001,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2035,8 +2035,8 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -2047,9 +2047,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; 
SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2081,8 +2081,8 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2094,9 +2094,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2128,8 +2128,8 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2141,9 +2141,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2175,8 +2175,8 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2188,9 +2188,9 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2222,8 +2222,8 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2235,9 +2235,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2269,8 +2269,8 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2282,9 +2282,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: 
ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2316,8 +2316,8 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2329,9 +2329,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2363,8 +2363,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2376,9 +2376,9 @@ define i8 
@acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2410,8 +2410,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2423,9 +2423,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2457,8 +2457,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, 
[acquire_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2470,9 +2470,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2504,8 +2504,8 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2517,9 +2517,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; 
SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2550,8 +2550,8 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2563,9 +2563,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2596,8 +2596,8 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2609,9 +2609,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr 
%addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2642,8 +2642,8 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2655,9 +2655,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2688,8 +2688,8 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: 
ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2701,9 +2701,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2734,8 +2734,8 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2747,9 +2747,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, 
[%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2780,8 +2780,8 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2793,9 +2793,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2826,8 +2826,8 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2839,9 +2839,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr 
addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2872,8 +2872,8 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2885,9 +2885,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2918,8 +2918,8 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; 
SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2931,9 +2931,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2965,8 +2965,8 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2978,9 +2978,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; 
SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3012,8 +3012,8 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3025,9 +3025,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3059,8 +3059,8 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3072,9 +3072,9 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 
%r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3106,8 +3106,8 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3119,9 +3119,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3153,8 +3153,8 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 
%rs1, [release_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3166,9 +3166,9 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3200,8 +3200,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3213,9 +3213,9 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3247,8 +3247,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3260,9 +3260,9 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3294,8 +3294,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3307,9 +3307,9 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3341,8 +3341,8 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3354,9 +3354,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3388,8 +3388,8 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3401,9 +3401,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3435,8 +3435,8 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3448,9 +3448,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3482,8 +3482,8 @@ define i8 @release_seq_cst_i8_global_sys(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3495,9 +3495,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3529,8 +3529,8 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3542,9 +3542,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 
%r15, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3576,8 +3576,8 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3589,9 +3589,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3623,8 +3623,8 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3636,9 +3636,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3670,8 +3670,8 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3683,9 +3683,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3717,8 +3717,8 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 
%rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3730,9 +3730,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3764,8 +3764,8 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3777,9 +3777,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; 
SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3811,8 +3811,8 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3824,9 +3824,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3858,8 +3858,8 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, 
%rd2; @@ -3871,9 +3871,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3905,8 +3905,8 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3918,9 +3918,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3952,8 +3952,8 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3965,9 +3965,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3999,8 +3999,8 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4012,9 +4012,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4046,8 +4046,8 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4059,9 +4059,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4093,8 +4093,8 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: 
cvt.u32.u64 %r9, %rd2; @@ -4106,9 +4106,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4140,8 +4140,8 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4153,9 +4153,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4187,8 +4187,8 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4200,9 +4200,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4215,7 +4215,7 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB90_1; ; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -4234,8 +4234,8 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ 
-4247,9 +4247,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4262,7 +4262,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB91_1; ; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4281,8 +4281,8 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4294,9 +4294,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; 
SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4309,7 +4309,7 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB92_1; ; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -4328,8 +4328,8 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4341,9 +4341,9 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4356,7 +4356,7 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; 
SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB93_1; ; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -4375,8 +4375,8 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4388,9 +4388,9 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4403,7 +4403,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB94_1; ; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4422,8 +4422,8 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, 
[acq_rel_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4435,9 +4435,9 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4450,7 +4450,7 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB95_1; ; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -4469,8 +4469,8 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4482,9 +4482,9 @@ define i8 
@acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4497,7 +4497,7 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB96_1; ; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -4516,8 +4516,8 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4529,9 +4529,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, 
[%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4544,7 +4544,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB97_1; ; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4563,8 +4563,8 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4576,9 +4576,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4591,7 +4591,7 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 
%p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB98_1; ; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -4610,8 +4610,8 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4623,9 +4623,9 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4638,7 +4638,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB99_1; ; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -4657,8 +4657,8 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; 
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4670,9 +4670,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4685,7 +4685,7 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB100_1; ; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4704,8 +4704,8 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4717,9 +4717,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4732,7 +4732,7 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB101_1; ; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -4751,8 +4751,8 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4764,9 +4764,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: 
$L__BB102_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4779,7 +4779,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB102_1; ; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -4798,8 +4798,8 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4811,9 +4811,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4826,7 +4826,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 
bra $L__BB103_1; ; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4845,8 +4845,8 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4858,9 +4858,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4873,7 +4873,7 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB104_1; ; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -4892,8 +4892,8 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4905,9 +4905,9 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4920,7 +4920,7 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB105_1; ; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -4939,8 +4939,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4952,9 +4952,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; 
SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4967,7 +4967,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB106_1; ; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4986,8 +4986,8 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4999,9 +4999,9 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 
%r20, %r16, %r2; ; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5014,7 +5014,7 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB107_1; ; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -5033,8 +5033,8 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5046,9 +5046,9 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5061,7 +5061,7 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; 
SM70-NEXT: @%p2 bra $L__BB108_1; ; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -5080,8 +5080,8 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5093,9 +5093,9 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5108,7 +5108,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB109_1; ; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -5127,8 +5127,8 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5140,9 +5140,9 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5155,7 +5155,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB110_1; ; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -5174,8 +5174,8 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5187,9 +5187,9 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5202,7 +5202,7 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB111_1; ; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -5221,8 +5221,8 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5234,9 +5234,9 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: 
and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5249,7 +5249,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB112_1; ; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -5268,8 +5268,8 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5281,9 +5281,9 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB113_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5296,7 +5296,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; 
+; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB113_1; ; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -5315,8 +5315,8 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5328,9 +5328,9 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5343,7 +5343,7 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB114_1; ; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -5362,8 +5362,8 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, 
[seq_cst_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5375,9 +5375,9 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5390,7 +5390,7 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB115_1; ; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -5409,8 +5409,8 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5422,9 +5422,9 
@@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5437,7 +5437,7 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB116_1; ; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -5456,8 +5456,8 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5469,9 +5469,9 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, 
[%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5484,7 +5484,7 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB117_1; ; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -5503,8 +5503,8 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5516,9 +5516,9 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5531,7 +5531,7 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, 
%r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB118_1; ; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -5550,8 +5550,8 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5563,9 +5563,9 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5578,7 +5578,7 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB119_1; ; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -5597,8 +5597,8 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[seq_cst_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5610,9 +5610,9 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5625,7 +5625,7 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB120_1; ; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -5644,8 +5644,8 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5657,9 +5657,9 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; 
SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5672,7 +5672,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB121_1; ; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -5691,8 +5691,8 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5704,9 +5704,9 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 
%r20, %r16, %r2; ; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5719,7 +5719,7 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB122_1; ; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -5738,8 +5738,8 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5751,9 +5751,9 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5766,7 +5766,7 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: 
mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB123_1; ; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -5785,8 +5785,8 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5798,9 +5798,9 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5813,7 +5813,7 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB124_1; ; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -5832,8 +5832,8 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: 
ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5845,9 +5845,9 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5860,7 +5860,7 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB125_1; ; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -5879,8 +5879,8 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5892,9 +5892,9 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; 
SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5907,7 +5907,7 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB126_1; ; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -5926,8 +5926,8 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5939,9 +5939,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB127_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5954,7 +5954,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB127_1; ; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -5973,8 +5973,8 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5986,9 +5986,9 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6001,7 +6001,7 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB128_1; ; SM70-NEXT: $L__BB128_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -6020,8 +6020,8 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6033,9 +6033,9 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6048,7 +6048,7 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB129_1; ; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -6067,8 +6067,8 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 
%rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6080,9 +6080,9 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6095,7 +6095,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB130_1; ; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -6114,8 +6114,8 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6127,9 +6127,9 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6142,7 +6142,7 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB131_1; ; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -6161,8 +6161,8 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6174,9 +6174,9 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6189,7 +6189,7 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB132_1; ; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -6208,8 +6208,8 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6221,9 +6221,9 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6236,7 +6236,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB133_1; ; SM70-NEXT: 
$L__BB133_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -6255,8 +6255,8 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6268,9 +6268,9 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6283,7 +6283,7 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM70-NEXT: mov.u32 %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB134_1; ; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -6302,10 +6302,10 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6315,7 +6315,7 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6328,7 +6328,7 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB135_1; ; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6346,10 +6346,10 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 
%r9, [monotonic_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6359,7 +6359,7 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6372,7 +6372,7 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB136_1; ; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6390,10 +6390,10 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6403,7 +6403,7 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6416,7 +6416,7 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB137_1; ; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6434,10 +6434,10 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6447,7 +6447,7 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6460,7 +6460,7 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; 
SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB138_1; ; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6478,10 +6478,10 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6491,7 +6491,7 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6504,7 +6504,7 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB139_1; ; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6522,10 +6522,10 
@@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6535,7 +6535,7 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6548,7 +6548,7 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB140_1; ; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6566,10 +6566,10 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, 
[monotonic_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6579,7 +6579,7 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6592,7 +6592,7 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB141_1; ; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6610,10 +6610,10 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 
%r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6623,7 +6623,7 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6636,7 +6636,7 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB142_1; ; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6654,10 +6654,10 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6667,7 +6667,7 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; 
SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6680,7 +6680,7 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB143_1; ; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -6698,10 +6698,10 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6711,7 +6711,7 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6724,7 +6724,7 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; 
SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB144_1; ; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -6743,10 +6743,10 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6756,7 +6756,7 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6769,7 +6769,7 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB145_1; ; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -6788,10 +6788,10 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6801,7 +6801,7 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6814,7 +6814,7 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB146_1; ; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -6833,10 +6833,10 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, 
[monotonic_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6846,7 +6846,7 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6859,7 +6859,7 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB147_1; ; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -6878,10 +6878,10 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6891,7 +6891,7 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6904,7 +6904,7 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB148_1; ; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -6923,10 +6923,10 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6936,7 +6936,7 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6949,7 +6949,7 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr 
addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB149_1; ; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -6968,10 +6968,10 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -6981,7 +6981,7 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6994,7 +6994,7 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB150_1; ; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7013,10 +7013,10 @@ define 
i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7026,7 +7026,7 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7039,7 +7039,7 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB151_1; ; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -7058,10 +7058,10 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; 
+; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7071,7 +7071,7 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7084,7 +7084,7 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB152_1; ; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -7103,10 +7103,10 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ 
-7117,7 +7117,7 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7130,7 +7130,7 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB153_1; ; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7149,10 +7149,10 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7163,7 +7163,7 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 @@ -7176,7 +7176,7 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB154_1; ; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -7195,10 +7195,10 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7209,7 +7209,7 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7222,7 +7222,7 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB155_1; ; SM70-NEXT: $L__BB155_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -7241,10 +7241,10 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7255,7 +7255,7 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7268,7 +7268,7 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB156_1; ; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7287,10 +7287,10 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; 
+; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7301,7 +7301,7 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7314,7 +7314,7 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB157_1; ; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -7333,10 +7333,10 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7347,7 +7347,7 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7360,7 +7360,7 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB158_1; ; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -7379,10 +7379,10 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7393,7 +7393,7 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: 
and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7406,7 +7406,7 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB159_1; ; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7425,10 +7425,10 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7439,7 +7439,7 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7452,7 +7452,7 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 
%r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB160_1; ; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -7471,10 +7471,10 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -7485,7 +7485,7 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7498,7 +7498,7 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB161_1; ; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -7517,10 +7517,10 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, 
[acquire_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7530,7 +7530,7 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7543,7 +7543,7 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB162_1; ; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7562,10 +7562,10 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, 
[acquire_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7575,7 +7575,7 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7588,7 +7588,7 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB163_1; ; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -7607,10 +7607,10 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7620,7 +7620,7 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7633,7 +7633,7 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB164_1; ; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -7652,10 +7652,10 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7665,7 +7665,7 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7678,7 +7678,7 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; 
SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB165_1; ; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7697,10 +7697,10 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7710,7 +7710,7 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7723,7 +7723,7 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB166_1; ; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -7742,10 +7742,10 @@ define i16 
@acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7755,7 +7755,7 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7768,7 +7768,7 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB167_1; ; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -7787,10 +7787,10 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; 
SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7800,7 +7800,7 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7813,7 +7813,7 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB168_1; ; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7832,10 +7832,10 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; 
@@ -7845,7 +7845,7 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7858,7 +7858,7 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB169_1; ; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -7877,10 +7877,10 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7890,7 +7890,7 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB170_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7903,7 +7903,7 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB170_1; ; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -7922,10 +7922,10 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7935,7 +7935,7 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7948,7 +7948,7 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB171_1; ; 
SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -7967,10 +7967,10 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -7980,7 +7980,7 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7993,7 +7993,7 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB172_1; ; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -8012,10 +8012,10 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; 
SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -8025,7 +8025,7 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8038,7 +8038,7 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB173_1; ; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -8057,10 +8057,10 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 
3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -8070,7 +8070,7 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8083,7 +8083,7 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB174_1; ; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -8102,10 +8102,10 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -8115,7 +8115,7 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; 
SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8128,7 +8128,7 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB175_1; ; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -8147,10 +8147,10 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -8160,7 +8160,7 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8173,7 +8173,7 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 
%r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB176_1; ; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -8192,10 +8192,10 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -8205,7 +8205,7 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8218,7 +8218,7 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB177_1; ; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -8237,10 +8237,10 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; 
SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -8250,7 +8250,7 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8263,7 +8263,7 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB178_1; ; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -8282,10 +8282,10 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, 
[acquire_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -8295,7 +8295,7 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8308,7 +8308,7 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB179_1; ; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -8327,10 +8327,10 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8341,7 +8341,7 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, 
[%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8354,7 +8354,7 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB180_1; ; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -8373,10 +8373,10 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8387,7 +8387,7 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8400,7 +8400,7 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB181_1; ; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -8419,10 +8419,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8433,7 +8433,7 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8446,7 +8446,7 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB182_1; ; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -8465,10 +8465,10 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, 
[acquire_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8479,7 +8479,7 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8492,7 +8492,7 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB183_1; ; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -8511,10 +8511,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: 
ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8525,7 +8525,7 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8538,7 +8538,7 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB184_1; ; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -8557,10 +8557,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8571,7 +8571,7 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8584,7 +8584,7 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB185_1; ; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -8603,10 +8603,10 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8617,7 +8617,7 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8630,7 +8630,7 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM70-NEXT: and.b32 
%r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB186_1; ; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -8649,10 +8649,10 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8663,7 +8663,7 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8676,7 +8676,7 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB187_1; ; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -8695,10 +8695,10 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8709,7 +8709,7 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8722,7 +8722,7 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB188_1; ; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -8741,10 +8741,10 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 
%r9, [release_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8755,7 +8755,7 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8768,7 +8768,7 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB189_1; ; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -8786,10 +8786,10 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8800,7 +8800,7 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, 
%rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8813,7 +8813,7 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB190_1; ; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -8831,10 +8831,10 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8845,7 +8845,7 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8858,7 +8858,7 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 
%cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB191_1; ; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -8876,10 +8876,10 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8890,7 +8890,7 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8903,7 +8903,7 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB192_1; ; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -8921,10 
+8921,10 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8935,7 +8935,7 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8948,7 +8948,7 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB193_1; ; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -8966,10 +8966,10 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, 
[release_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -8980,7 +8980,7 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8993,7 +8993,7 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB194_1; ; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -9011,10 +9011,10 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9025,7 +9025,7 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9038,7 +9038,7 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB195_1; ; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -9056,10 +9056,10 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9070,7 +9070,7 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; 
SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9083,7 +9083,7 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB196_1; ; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -9101,10 +9101,10 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9115,7 +9115,7 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9128,7 +9128,7 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, 
%r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB197_1; ; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -9146,10 +9146,10 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9160,7 +9160,7 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9173,7 +9173,7 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB198_1; ; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -9192,10 +9192,10 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 
%rs1, [release_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9206,7 +9206,7 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9219,7 +9219,7 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB199_1; ; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -9238,10 +9238,10 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: 
ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9252,7 +9252,7 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9265,7 +9265,7 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB200_1; ; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -9284,10 +9284,10 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9298,7 +9298,7 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: 
ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9311,7 +9311,7 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB201_1; ; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -9330,10 +9330,10 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9344,7 +9344,7 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9357,7 +9357,7 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM70-NEXT: and.b32 %r8, 
%r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB202_1; ; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -9376,10 +9376,10 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9390,7 +9390,7 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9403,7 +9403,7 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB203_1; ; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -9422,10 +9422,10 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9436,7 +9436,7 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9449,7 +9449,7 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB204_1; ; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -9468,10 +9468,10 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, 
[release_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9482,7 +9482,7 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9495,7 +9495,7 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB205_1; ; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -9514,10 +9514,10 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9528,7 +9528,7 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 
%r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9541,7 +9541,7 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB206_1; ; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -9560,10 +9560,10 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9574,7 +9574,7 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9587,7 +9587,7 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: 
Header=BB207_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB207_1; ; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -9606,10 +9606,10 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9620,7 +9620,7 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9633,7 +9633,7 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB208_1; ; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -9652,10 +9652,10 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 
%rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9666,7 +9666,7 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9679,7 +9679,7 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB209_1; ; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -9698,10 +9698,10 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, 
[release_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9712,7 +9712,7 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9725,7 +9725,7 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB210_1; ; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -9744,10 +9744,10 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9758,7 +9758,7 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9771,7 +9771,7 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB211_1; ; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -9790,10 +9790,10 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9804,7 +9804,7 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9817,7 +9817,7 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in 
Loop: Header=BB212_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB212_1; ; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -9836,10 +9836,10 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9850,7 +9850,7 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9863,7 +9863,7 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB213_1; ; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -9882,10 +9882,10 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, 
i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9896,7 +9896,7 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9909,7 +9909,7 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB214_1; ; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -9928,10 +9928,10 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; ; SM70-NEXT: 
fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9942,7 +9942,7 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9955,7 +9955,7 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB215_1; ; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -9974,10 +9974,10 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -9988,7 +9988,7 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) 
{ ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10001,7 +10001,7 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB216_1; ; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10020,10 +10020,10 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10034,7 +10034,7 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10047,7 +10047,7 @@ define i16 
@acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB217_1; ; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -10066,10 +10066,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10080,7 +10080,7 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10093,7 +10093,7 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB218_1; ; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ 
-10112,10 +10112,10 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10126,7 +10126,7 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10139,7 +10139,7 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB219_1; ; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10158,10 +10158,10 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, 
[acq_rel_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10172,7 +10172,7 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10185,7 +10185,7 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB220_1; ; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -10204,10 +10204,10 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 
%r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10218,7 +10218,7 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10231,7 +10231,7 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB221_1; ; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -10250,10 +10250,10 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10264,7 +10264,7 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 
%r19, %r15, %r2; ; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10277,7 +10277,7 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB222_1; ; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10296,10 +10296,10 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10310,7 +10310,7 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10323,7 +10323,7 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: 
mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB223_1; ; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -10342,10 +10342,10 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10356,7 +10356,7 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10369,7 +10369,7 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB224_1; ; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -10388,10 +10388,10 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 
%rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10402,7 +10402,7 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10415,7 +10415,7 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB225_1; ; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10434,10 +10434,10 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: 
ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10448,7 +10448,7 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10461,7 +10461,7 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB226_1; ; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -10480,10 +10480,10 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10494,7 +10494,7 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: 
ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10507,7 +10507,7 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB227_1; ; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -10526,10 +10526,10 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10540,7 +10540,7 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10553,7 +10553,7 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; 
; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB228_1; ; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10572,10 +10572,10 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10586,7 +10586,7 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10599,7 +10599,7 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB229_1; ; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -10618,10 +10618,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10632,7 +10632,7 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10645,7 +10645,7 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB230_1; ; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -10664,10 +10664,10 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 
%r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10678,7 +10678,7 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10691,7 +10691,7 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB231_1; ; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10710,10 +10710,10 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10724,7 +10724,7 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10737,7 +10737,7 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB232_1; ; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -10756,10 +10756,10 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10770,7 +10770,7 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10783,7 +10783,7 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) 
%addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB233_1; ; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -10802,10 +10802,10 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10816,7 +10816,7 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10829,7 +10829,7 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB234_1; ; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10848,10 +10848,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr 
%addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10862,7 +10862,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10875,7 +10875,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB235_1; ; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -10894,10 +10894,10 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; ; 
SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10908,7 +10908,7 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10921,7 +10921,7 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB236_1; ; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -10940,10 +10940,10 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -10954,7 +10954,7 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10967,7 +10967,7 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB237_1; ; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -10986,10 +10986,10 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11000,7 +11000,7 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11013,7 +11013,7 @@ define i16 
@acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB238_1; ; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11032,10 +11032,10 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11046,7 +11046,7 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11059,7 +11059,7 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB239_1; ; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ 
-11078,10 +11078,10 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11092,7 +11092,7 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11105,7 +11105,7 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB240_1; ; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -11124,10 +11124,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, 
[acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11138,7 +11138,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11151,7 +11151,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB241_1; ; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11170,10 +11170,10 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 
%r11, %r10, 3; @@ -11184,7 +11184,7 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11197,7 +11197,7 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB242_1; ; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -11216,10 +11216,10 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11230,7 +11230,7 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB243_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11243,7 +11243,7 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB243_1; ; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -11262,10 +11262,10 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11276,7 +11276,7 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11289,7 +11289,7 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra 
$L__BB244_1; ; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11308,10 +11308,10 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11322,7 +11322,7 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11335,7 +11335,7 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB245_1; ; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -11354,10 +11354,10 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11368,7 +11368,7 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11381,7 +11381,7 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB246_1; ; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -11400,10 +11400,10 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, 
[seq_cst_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11414,7 +11414,7 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11427,7 +11427,7 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB247_1; ; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11446,10 +11446,10 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11460,7 +11460,7 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11473,7 +11473,7 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB248_1; ; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -11492,10 +11492,10 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11506,7 +11506,7 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11519,7 +11519,7 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 
; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB249_1; ; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -11538,10 +11538,10 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11552,7 +11552,7 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11565,7 +11565,7 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB250_1; ; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11584,10 +11584,10 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; 
SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11598,7 +11598,7 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11611,7 +11611,7 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB251_1; ; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -11630,10 +11630,10 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; ; 
SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11644,7 +11644,7 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11657,7 +11657,7 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB252_1; ; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -11676,10 +11676,10 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11690,7 +11690,7 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11703,7 +11703,7 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB253_1; ; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11722,10 +11722,10 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11736,7 +11736,7 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11749,7 +11749,7 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) 
{ ; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB254_1; ; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -11768,10 +11768,10 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11782,7 +11782,7 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11795,7 +11795,7 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB255_1; ; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -11814,10 +11814,10 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr 
addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11828,7 +11828,7 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11841,7 +11841,7 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB256_1; ; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11860,10 +11860,10 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11874,7 +11874,7 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11887,7 +11887,7 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB257_1; ; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -11906,10 +11906,10 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11920,7 +11920,7 @@ define i16 
@seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11933,7 +11933,7 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB258_1; ; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -11952,10 +11952,10 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -11966,7 +11966,7 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 @@ -11979,7 +11979,7 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB259_1; ; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -11998,10 +11998,10 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12012,7 +12012,7 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12025,7 +12025,7 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB260_1; ; SM70-NEXT: $L__BB260_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -12044,10 +12044,10 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12058,7 +12058,7 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12071,7 +12071,7 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB261_1; ; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -12090,10 +12090,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 
%rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12104,7 +12104,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12117,7 +12117,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB262_1; ; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -12136,10 +12136,10 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, 
%r10, 3; @@ -12150,7 +12150,7 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12163,7 +12163,7 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB263_1; ; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -12182,10 +12182,10 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12196,7 +12196,7 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop ; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12209,7 +12209,7 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB264_1; ; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -12228,10 +12228,10 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12242,7 +12242,7 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12255,7 +12255,7 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB265_1; ; 
SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -12274,10 +12274,10 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12288,7 +12288,7 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12301,7 +12301,7 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB266_1; ; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -12320,10 +12320,10 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12334,7 +12334,7 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12347,7 +12347,7 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB267_1; ; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -12366,10 +12366,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; ; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12380,7 +12380,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12393,7 +12393,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB268_1; ; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -12412,10 +12412,10 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -12426,7 +12426,7 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 
%r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12439,7 +12439,7 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; ; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM70-NEXT: mov.u32 %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB269_1; ; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -12456,9 +12456,9 @@ define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12473,9 +12473,9 @@ define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; ; 
SM70-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12490,9 +12490,9 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12507,9 +12507,9 @@ define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12524,9 +12524,9 @@ define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, 
[monotonic_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12541,9 +12541,9 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12558,9 +12558,9 @@ define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12575,9 +12575,9 @@ define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12592,9 +12592,9 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12609,9 +12609,9 @@ define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, 
[monotonic_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12626,9 +12626,9 @@ define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12643,9 +12643,9 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12660,9 +12660,9 @@ define i32 @monotonic_acquire_i32_global_sys(ptr 
addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12677,9 +12677,9 @@ define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12694,9 +12694,9 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, 
[monotonic_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12711,9 +12711,9 @@ define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12728,9 +12728,9 @@ define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12745,9 +12745,9 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[monotonic_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12762,10 +12762,10 @@ define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12780,10 +12780,10 @@ define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12798,10 +12798,10 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12816,10 +12816,10 @@ define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12834,10 +12834,10 @@ define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12852,10 +12852,10 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12870,10 +12870,10 @@ define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, 
[monotonic_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12888,10 +12888,10 @@ define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12906,10 +12906,10 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12924,9 +12924,9 @@ define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[acquire_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12941,9 +12941,9 @@ define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12958,9 +12958,9 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, 
[%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12975,9 +12975,9 @@ define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -12992,9 +12992,9 @@ define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13009,9 +13009,9 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, 
[acquire_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13026,9 +13026,9 @@ define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13043,9 +13043,9 @@ define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13060,9 +13060,9 @@ define i32 
@acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13077,9 +13077,9 @@ define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13094,9 +13094,9 @@ define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, 
[acquire_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13111,9 +13111,9 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13128,9 +13128,9 @@ define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13145,9 +13145,9 @@ define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; 
SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13162,9 +13162,9 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13179,9 +13179,9 @@ define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: 
ret; @@ -13196,9 +13196,9 @@ define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13213,9 +13213,9 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13230,10 +13230,10 @@ define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, 
[acquire_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13248,10 +13248,10 @@ define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13266,10 +13266,10 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13284,10 +13284,10 @@ define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg 
.b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13302,10 +13302,10 @@ define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13320,10 +13320,10 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, 
[acquire_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13338,10 +13338,10 @@ define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13356,10 +13356,10 @@ define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13374,10 +13374,10 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13392,9 +13392,9 @@ define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13409,9 +13409,9 @@ define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; ; 
SM70-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13426,9 +13426,9 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13443,9 +13443,9 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13460,9 +13460,9 @@ define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: 
ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13477,9 +13477,9 @@ define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13494,9 +13494,9 @@ define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13511,9 +13511,9 @@ define i32 
@release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13528,9 +13528,9 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13545,9 +13545,9 @@ define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, 
[release_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13562,9 +13562,9 @@ define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13579,9 +13579,9 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13596,9 +13596,9 @@ define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM70-NEXT: 
ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13613,9 +13613,9 @@ define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13630,9 +13630,9 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ 
-13647,9 +13647,9 @@ define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13664,9 +13664,9 @@ define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13681,9 +13681,9 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 
%r1, [release_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13698,10 +13698,10 @@ define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13716,10 +13716,10 @@ define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13734,10 +13734,10 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13752,10 +13752,10 @@ define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13770,10 +13770,10 @@ define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, 
[release_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13788,10 +13788,10 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13806,10 +13806,10 @@ define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13824,10 +13824,10 @@ define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; 
SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13842,10 +13842,10 @@ define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13860,9 +13860,9 @@ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, 
%r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13877,9 +13877,9 @@ define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13894,9 +13894,9 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13911,9 +13911,9 @@ define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; 
SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13928,9 +13928,9 @@ define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13945,9 +13945,9 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13962,9 +13962,9 @@ define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; 
SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13979,9 +13979,9 @@ define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -13996,9 +13996,9 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: 
ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14013,9 +14013,9 @@ define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14030,9 +14030,9 @@ define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14047,9 +14047,9 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, 
[acq_rel_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14064,9 +14064,9 @@ define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14081,9 +14081,9 @@ define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14098,9 +14098,9 @@ 
define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14115,9 +14115,9 @@ define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14132,9 +14132,9 @@ define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, 
[acq_rel_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14149,9 +14149,9 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14166,10 +14166,10 @@ define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14184,10 +14184,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14202,10 +14202,10 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14220,10 +14220,10 @@ define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; ; 
SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14238,10 +14238,10 @@ define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14256,10 +14256,10 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14274,10 +14274,10 @@ define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14292,10 +14292,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14310,10 +14310,10 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; 
SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14328,10 +14328,10 @@ define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14346,10 +14346,10 @@ define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14364,10 +14364,10 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; 
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14382,10 +14382,10 @@ define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14400,10 +14400,10 @@ define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], 
%r2; ; SM70-NEXT: ret; @@ -14418,10 +14418,10 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14436,10 +14436,10 @@ define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14454,10 +14454,10 @@ define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, 
[seq_cst_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14472,10 +14472,10 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14490,10 +14490,10 @@ define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14508,10 +14508,10 @@ 
define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14526,10 +14526,10 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14544,10 +14544,10 @@ define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, 
[seq_cst_acquire_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14562,10 +14562,10 @@ define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14580,10 +14580,10 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14598,10 +14598,10 @@ define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; 
SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14616,10 +14616,10 @@ define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14634,10 +14634,10 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, 
[seq_cst_acquire_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14652,10 +14652,10 @@ define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14670,10 +14670,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14688,10 +14688,10 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 
%rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14706,10 +14706,10 @@ define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14724,10 +14724,10 @@ define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, 
[seq_cst_seq_cst_i32_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14742,10 +14742,10 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14760,10 +14760,10 @@ define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14778,10 +14778,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; 
SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14796,10 +14796,10 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -14813,9 +14813,9 @@ define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b64 
%rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14829,9 +14829,9 @@ define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14845,9 +14845,9 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14861,9 +14861,9 @@ define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; 
SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14877,9 +14877,9 @@ define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14893,9 +14893,9 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; 
; SM70-NEXT: ret; @@ -14909,9 +14909,9 @@ define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14925,9 +14925,9 @@ define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14941,9 +14941,9 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; 
SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14957,9 +14957,9 @@ define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14973,9 +14973,9 @@ define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -14989,9 +14989,9 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, 
i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15005,9 +15005,9 @@ define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15021,9 +15021,9 @@ define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15037,9 +15037,9 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15053,9 +15053,9 @@ define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15069,9 +15069,9 @@ define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15085,9 +15085,9 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15101,10 +15101,10 @@ define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[monotonic_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15118,10 +15118,10 @@ define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15135,10 +15135,10 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15152,10 +15152,10 @@ define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[monotonic_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15169,10 +15169,10 @@ define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15186,10 +15186,10 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: 
ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15203,10 +15203,10 @@ define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15220,10 +15220,10 @@ define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15237,10 +15237,10 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15254,9 +15254,9 @@ define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15270,9 +15270,9 @@ define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[acquire_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15286,9 +15286,9 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15302,9 +15302,9 @@ define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15318,9 +15318,9 @@ define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15334,9 +15334,9 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15350,9 +15350,9 @@ define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 
[func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15366,9 +15366,9 @@ define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15382,9 +15382,9 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15398,9 +15398,9 @@ define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15414,9 +15414,9 @@ define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15430,9 +15430,9 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15446,9 +15446,9 @@ define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; 
SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15462,9 +15462,9 @@ define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15478,9 +15478,9 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[acquire_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15494,9 +15494,9 @@ define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15510,9 +15510,9 @@ define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15526,9 +15526,9 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15542,10 +15542,10 @@ define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15559,10 +15559,10 @@ define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15576,10 +15576,10 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15593,10 +15593,10 @@ define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15610,10 +15610,10 @@ define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: 
ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15627,10 +15627,10 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15644,10 +15644,10 @@ define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: 
ret; @@ -15661,10 +15661,10 @@ define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15678,10 +15678,10 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15695,9 +15695,9 @@ define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; 
SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15711,9 +15711,9 @@ define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15727,9 +15727,9 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15743,9 +15743,9 @@ define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 
%cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15759,9 +15759,9 @@ define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15775,9 +15775,9 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15791,9 +15791,9 @@ define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15807,9 +15807,9 @@ define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15823,9 +15823,9 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15839,9 +15839,9 @@ define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15855,9 +15855,9 @@ define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: 
atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15871,9 +15871,9 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15887,9 +15887,9 @@ define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15903,9 +15903,9 @@ define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, 
[release_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15919,9 +15919,9 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15935,9 +15935,9 @@ define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15951,9 +15951,9 @@ define i64 @release_acquire_i64_shared_cta(ptr 
addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15967,9 +15967,9 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -15983,10 +15983,10 @@ define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16000,10 +16000,10 @@ define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16017,10 +16017,10 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16034,10 +16034,10 @@ define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16051,10 +16051,10 @@ define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16068,10 +16068,10 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: 
ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16085,10 +16085,10 @@ define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16102,10 +16102,10 @@ define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16119,10 +16119,10 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[release_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16136,9 +16136,9 @@ define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16152,9 +16152,9 @@ define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; ; 
SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16168,9 +16168,9 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16184,9 +16184,9 @@ define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16200,9 +16200,9 @@ define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; 
SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16216,9 +16216,9 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16232,9 +16232,9 @@ define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16248,9 
+16248,9 @@ define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16264,9 +16264,9 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16280,9 +16280,9 @@ define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, 
[acq_rel_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16296,9 +16296,9 @@ define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16312,9 +16312,9 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16328,9 +16328,9 @@ define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: 
// %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16344,9 +16344,9 @@ define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16360,9 +16360,9 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: 
atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16376,9 +16376,9 @@ define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16392,9 +16392,9 @@ define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16408,9 +16408,9 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, 
[acq_rel_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16424,10 +16424,10 @@ define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16441,10 +16441,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16458,10 +16458,10 @@ 
define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16475,10 +16475,10 @@ define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16492,10 +16492,10 @@ define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: 
ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16509,10 +16509,10 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16526,10 +16526,10 @@ define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16543,10 +16543,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cta(ptr 
addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16560,10 +16560,10 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16577,10 +16577,10 @@ define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, 
[seq_cst_monotonic_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16594,10 +16594,10 @@ define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16611,10 +16611,10 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16628,10 +16628,10 @@ define i64 
@seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16645,10 +16645,10 @@ define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16662,10 +16662,10 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; 
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16679,10 +16679,10 @@ define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16696,10 +16696,10 @@ define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16713,10 +16713,10 @@ 
define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16730,10 +16730,10 @@ define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16747,10 +16747,10 @@ define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: 
ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16764,10 +16764,10 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16781,10 +16781,10 @@ define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16798,10 +16798,10 @@ define i64 @seq_cst_acquire_i64_global_cta(ptr 
addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16815,10 +16815,10 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16832,10 +16832,10 @@ define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, 
[seq_cst_acquire_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16849,10 +16849,10 @@ define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16866,10 +16866,10 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16883,10 +16883,10 @@ define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, 
i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16900,10 +16900,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16917,10 +16917,10 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16934,10 +16934,10 @@ define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16951,10 +16951,10 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16968,10 +16968,10 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -16985,10 +16985,10 @@ define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; ; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -17002,10 +17002,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -17019,10 +17019,10 @@ define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index f2ceda7951d45..68658255ad5af 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -146,8 +146,8 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, 
[%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -191,8 +191,8 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -203,9 +203,9 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -236,8 +236,8 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -248,9 +248,9 @@ define i8 
@monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -281,8 +281,8 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -293,9 +293,9 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -326,8 +326,8 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -338,9 +338,9 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -371,8 +371,8 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -383,9 +383,9 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[monotonic_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -416,8 +416,8 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -428,9 +428,9 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -461,8 +461,8 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -473,9 +473,9 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -506,8 +506,8 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -518,9 +518,9 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -551,8 +551,8 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -563,9 +563,9 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -597,8 +597,8 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -609,9 +609,9 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 
%r15, [monotonic_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -643,8 +643,8 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -655,9 +655,9 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -689,8 +689,8 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -701,9 +701,9 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -735,8 +735,8 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -747,9 +747,9 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -781,8 +781,8 @@ 
define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -793,9 +793,9 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -827,8 +827,8 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -839,9 +839,9 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -873,8 +873,8 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -885,9 +885,9 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -919,8 +919,8 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -931,9 +931,9 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -965,8 +965,8 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -977,9 +977,9 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1011,8 +1011,8 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1023,9 +1023,9 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1057,8 +1057,8 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1069,9 +1069,9 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1103,8 +1103,8 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1116,9 +1116,9 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1150,8 +1150,8 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1163,9 +1163,9 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1197,8 +1197,8 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1210,9 +1210,9 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 
%r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1244,8 +1244,8 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1257,9 +1257,9 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1291,8 +1291,8 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1304,9 +1304,9 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 
%r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1338,8 +1338,8 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1351,9 +1351,9 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1385,8 +1385,8 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1398,9 +1398,9 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1432,8 +1432,8 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1445,9 +1445,9 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, 
[%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1479,8 +1479,8 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1492,9 +1492,9 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1526,8 +1526,8 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1539,9 +1539,9 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) 
%addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1573,8 +1573,8 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1586,9 +1586,9 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1620,8 +1620,8 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1633,9 +1633,9 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1667,8 +1667,8 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1679,9 +1679,9 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, 
[%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1713,8 +1713,8 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1725,9 +1725,9 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1759,8 +1759,8 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1771,9 +1771,9 @@ define i8 
@acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1805,8 +1805,8 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1817,9 +1817,9 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1851,8 +1851,8 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[acquire_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1863,9 +1863,9 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1897,8 +1897,8 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1909,9 +1909,9 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: 
ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1943,8 +1943,8 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -1955,9 +1955,9 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1989,8 +1989,8 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: 
and.b32 %r10, %r9, 3; @@ -2001,9 +2001,9 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2035,8 +2035,8 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2047,9 +2047,9 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2081,8 +2081,8 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2093,9 +2093,9 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2127,8 +2127,8 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2139,9 +2139,9 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[acquire_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2173,8 +2173,8 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2185,9 +2185,9 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2219,8 +2219,8 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2231,9 +2231,9 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2265,8 +2265,8 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2277,9 +2277,9 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2311,8 +2311,8 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2323,9 +2323,9 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2357,8 +2357,8 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2369,9 +2369,9 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; ; 
SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2403,8 +2403,8 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2415,9 +2415,9 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2449,8 +2449,8 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 
%r10, %r9, 3; @@ -2461,9 +2461,9 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2495,8 +2495,8 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2507,9 +2507,9 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2541,8 +2541,8 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2553,9 +2553,9 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2587,8 +2587,8 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2599,9 +2599,9 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; ; 
SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2633,8 +2633,8 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2645,9 +2645,9 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2679,8 +2679,8 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, 
%rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2691,9 +2691,9 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2725,8 +2725,8 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -2737,9 +2737,9 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2771,8 +2771,8 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2784,9 +2784,9 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2818,8 +2818,8 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2831,9 +2831,9 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2865,8 +2865,8 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2878,9 +2878,9 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2912,8 +2912,8 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2925,9 +2925,9 @@ define i8 
@acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2959,8 +2959,8 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2972,9 +2972,9 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3006,8 +3006,8 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; 
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3019,9 +3019,9 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3053,8 +3053,8 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3066,9 +3066,9 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: 
ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3100,8 +3100,8 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3113,9 +3113,9 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3147,8 +3147,8 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3160,9 +3160,9 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) 
%addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3194,8 +3194,8 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3207,9 +3207,9 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3241,8 +3241,8 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: 
ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3254,9 +3254,9 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3288,8 +3288,8 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3301,9 +3301,9 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; 
; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3335,8 +3335,8 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3348,9 +3348,9 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3381,8 +3381,8 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3394,9 +3394,9 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; 
SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3427,8 +3427,8 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3440,9 +3440,9 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3473,8 +3473,8 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 
%rd2, [release_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3486,9 +3486,9 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3519,8 +3519,8 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3532,9 +3532,9 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 
%r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3565,8 +3565,8 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3578,9 +3578,9 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3611,8 +3611,8 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3624,9 +3624,9 @@ define i8 @release_monotonic_i8_global_cluster(ptr 
addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3657,8 +3657,8 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3670,9 +3670,9 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3703,8 +3703,8 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[release_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3716,9 +3716,9 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3749,8 +3749,8 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3762,9 +3762,9 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: 
ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3795,8 +3795,8 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3808,9 +3808,9 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3841,8 +3841,8 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; @@ -3854,9 +3854,9 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3887,8 +3887,8 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3900,9 +3900,9 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3934,8 +3934,8 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3947,9 +3947,9 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3981,8 +3981,8 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -3994,9 +3994,9 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: 
ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4028,8 +4028,8 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4041,9 +4041,9 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4075,8 +4075,8 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4088,9 +4088,9 @@ define i8 @release_acquire_i8_global_sys(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4122,8 +4122,8 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4135,9 +4135,9 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4169,8 +4169,8 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cluster_param_2]; -; 
SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4182,9 +4182,9 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4197,7 +4197,7 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB90_1; ; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -4216,8 +4216,8 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4229,9 +4229,9 @@ define i8 
@release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4244,7 +4244,7 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB91_1; ; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -4263,8 +4263,8 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4276,9 +4276,9 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, 
[%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4291,7 +4291,7 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB92_1; ; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -4310,8 +4310,8 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4323,9 +4323,9 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4338,7 +4338,7 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 
%p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB93_1; ; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -4357,8 +4357,8 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4370,9 +4370,9 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4385,7 +4385,7 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB94_1; ; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -4404,8 +4404,8 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4417,9 +4417,9 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4432,7 +4432,7 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB95_1; ; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -4451,8 +4451,8 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ 
-4464,9 +4464,9 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4479,7 +4479,7 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB96_1; ; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -4498,8 +4498,8 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4511,9 +4511,9 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; 
SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4526,7 +4526,7 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB97_1; ; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -4545,8 +4545,8 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4558,9 +4558,9 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4573,7 +4573,7 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; 
SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB98_1; ; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -4592,8 +4592,8 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4605,9 +4605,9 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4620,7 +4620,7 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB99_1; ; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -4639,8 +4639,8 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; 
SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4652,9 +4652,9 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4667,7 +4667,7 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB100_1; ; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -4686,8 +4686,8 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4699,9 +4699,9 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) 
%addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4714,7 +4714,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB101_1; ; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -4733,8 +4733,8 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4746,9 +4746,9 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; 
SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4761,7 +4761,7 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB102_1; ; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -4780,8 +4780,8 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4793,9 +4793,9 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4808,7 +4808,7 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, 
%r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB103_1; ; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -4827,8 +4827,8 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4840,9 +4840,9 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4855,7 +4855,7 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB104_1; ; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -4874,8 +4874,8 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[release_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4887,9 +4887,9 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4902,7 +4902,7 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB105_1; ; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -4921,8 +4921,8 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4934,9 +4934,9 
@@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4949,7 +4949,7 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB106_1; ; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -4968,8 +4968,8 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -4981,9 +4981,9 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4996,7 +4996,7 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB107_1; ; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -5015,8 +5015,8 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5028,9 +5028,9 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5043,7 +5043,7 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; 
SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB108_1; ; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -5062,8 +5062,8 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5075,9 +5075,9 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5090,7 +5090,7 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB109_1; ; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -5109,8 +5109,8 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5122,9 +5122,9 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5137,7 +5137,7 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB110_1; ; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -5156,8 +5156,8 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5169,9 +5169,9 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5184,7 +5184,7 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB111_1; ; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -5203,8 +5203,8 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5216,9 +5216,9 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; ; 
SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5231,7 +5231,7 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB112_1; ; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -5250,8 +5250,8 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5263,9 +5263,9 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5278,7 +5278,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in 
Loop: Header=BB113_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB113_1; ; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -5297,8 +5297,8 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5310,9 +5310,9 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5325,7 +5325,7 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB114_1; ; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -5344,8 +5344,8 @@ define i8 
@acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5357,9 +5357,9 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5372,7 +5372,7 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB115_1; ; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -5391,8 +5391,8 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5404,9 +5404,9 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5419,7 +5419,7 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB116_1; ; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -5438,8 +5438,8 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5451,9 +5451,9 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; 
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5466,7 +5466,7 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB117_1; ; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -5485,8 +5485,8 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5498,9 +5498,9 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB118_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5513,7 +5513,7 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB118_1; ; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -5532,8 +5532,8 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5545,9 +5545,9 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5560,7 +5560,7 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: 
@%p2 bra $L__BB119_1; ; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -5579,8 +5579,8 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5592,9 +5592,9 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5607,7 +5607,7 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB120_1; ; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -5626,8 +5626,8 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; 
SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5639,9 +5639,9 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5654,7 +5654,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB121_1; ; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -5673,8 +5673,8 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5686,9 +5686,9 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: 
and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5701,7 +5701,7 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB122_1; ; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -5720,8 +5720,8 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5733,9 +5733,9 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5748,7 +5748,7 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB123_1; ; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -5767,8 +5767,8 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5780,9 +5780,9 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5795,7 +5795,7 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB124_1; ; SM90-NEXT: $L__BB124_3: 
// %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -5814,8 +5814,8 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5827,9 +5827,9 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5842,7 +5842,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB125_1; ; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -5861,8 +5861,8 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; +; 
SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5874,9 +5874,9 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5889,7 +5889,7 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB126_1; ; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -5908,8 +5908,8 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5921,9 +5921,9 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5936,7 +5936,7 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB127_1; ; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -5955,8 +5955,8 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -5968,9 +5968,9 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, 
%r16, %r2; ; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -5983,7 +5983,7 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB128_1; ; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -6002,8 +6002,8 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6015,9 +6015,9 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6030,7 +6030,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 
%r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB129_1; ; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -6049,8 +6049,8 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6062,9 +6062,9 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6077,7 +6077,7 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB130_1; ; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -6096,8 +6096,8 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[acq_rel_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6109,9 +6109,9 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6124,7 +6124,7 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB131_1; ; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -6143,8 +6143,8 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6156,9 +6156,9 @@ define i8 
@acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6171,7 +6171,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB132_1; ; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -6190,8 +6190,8 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6203,9 +6203,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; 
SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6218,7 +6218,7 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB133_1; ; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -6237,8 +6237,8 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6250,9 +6250,9 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6265,7 +6265,7 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; 
+; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB134_1; ; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -6284,8 +6284,8 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6297,9 +6297,9 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6312,7 +6312,7 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB135_1; ; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -6331,8 +6331,8 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acq_rel_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6344,9 +6344,9 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6359,7 +6359,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB136_1; ; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -6378,8 +6378,8 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6391,9 +6391,9 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; 
SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6406,7 +6406,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB137_1; ; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -6425,8 +6425,8 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6438,9 +6438,9 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, 
[%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6453,7 +6453,7 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB138_1; ; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -6472,8 +6472,8 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6485,9 +6485,9 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6500,7 +6500,7 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: 
mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB139_1; ; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -6519,8 +6519,8 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6532,9 +6532,9 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6547,7 +6547,7 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB140_1; ; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -6566,8 +6566,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6579,9 +6579,9 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6594,7 +6594,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB141_1; ; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -6613,8 +6613,8 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6626,9 +6626,9 
@@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6641,7 +6641,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB142_1; ; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -6660,8 +6660,8 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6673,9 +6673,9 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6688,7 +6688,7 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB143_1; ; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -6707,8 +6707,8 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6720,9 +6720,9 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6735,7 +6735,7 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; 
SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB144_1; ; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -6754,8 +6754,8 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6767,9 +6767,9 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6782,7 +6782,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB145_1; ; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -6801,8 +6801,8 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6814,9 +6814,9 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6829,7 +6829,7 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB146_1; ; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -6848,8 +6848,8 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; @@ -6861,9 +6861,9 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6876,7 +6876,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB147_1; ; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -6895,8 +6895,8 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6908,9 +6908,9 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 
%r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6923,7 +6923,7 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB148_1; ; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -6942,8 +6942,8 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -6955,9 +6955,9 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -6970,7 +6970,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB149_1 
Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB149_1; ; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -6989,8 +6989,8 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7002,9 +7002,9 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7017,7 +7017,7 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB150_1; ; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -7036,8 +7036,8 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, 
i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7049,9 +7049,9 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7064,7 +7064,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB151_1; ; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -7083,8 +7083,8 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; ; 
SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7096,9 +7096,9 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7111,7 +7111,7 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB152_1; ; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -7130,8 +7130,8 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7143,9 +7143,9 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; 
+; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7158,7 +7158,7 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB153_1; ; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -7177,8 +7177,8 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7190,9 +7190,9 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7205,7 +7205,7 @@ 
define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB154_1; ; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -7224,8 +7224,8 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7237,9 +7237,9 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7252,7 +7252,7 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB155_1; ; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.gpu; @@ -7271,8 +7271,8 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7284,9 +7284,9 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7299,7 +7299,7 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB156_1; ; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -7318,8 +7318,8 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [seq_cst_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7331,9 +7331,9 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7346,7 +7346,7 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB157_1; ; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -7365,8 +7365,8 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7378,9 +7378,9 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, 
[seq_cst_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7393,7 +7393,7 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB158_1; ; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -7412,8 +7412,8 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7425,9 +7425,9 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7440,7 +7440,7 @@ define i8 
@seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB159_1; ; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -7459,8 +7459,8 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7472,9 +7472,9 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7487,7 +7487,7 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB160_1; ; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -7506,8 +7506,8 @@ define 
i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7519,9 +7519,9 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7534,7 +7534,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB161_1; ; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -7553,8 +7553,8 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [seq_cst_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7566,9 +7566,9 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7581,7 +7581,7 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB162_1; ; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -7600,8 +7600,8 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7613,9 +7613,9 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7628,7 +7628,7 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB163_1; ; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -7647,8 +7647,8 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7660,9 +7660,9 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ 
-7675,7 +7675,7 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB164_1; ; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -7694,8 +7694,8 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7707,9 +7707,9 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7722,7 +7722,7 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB165_1; ; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.cta; @@ -7741,8 +7741,8 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7754,9 +7754,9 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7769,7 +7769,7 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB166_1; ; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -7788,8 +7788,8 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, 
[seq_cst_acquire_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7801,9 +7801,9 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7816,7 +7816,7 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB167_1; ; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -7835,8 +7835,8 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7848,9 +7848,9 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7863,7 +7863,7 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB168_1; ; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -7882,8 +7882,8 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7895,9 +7895,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7910,7 
+7910,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB169_1; ; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -7929,8 +7929,8 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7942,9 +7942,9 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -7957,7 +7957,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB170_1; ; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.cluster; @@ -7976,8 +7976,8 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -7989,9 +7989,9 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8004,7 +8004,7 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB171_1; ; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -8023,8 +8023,8 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8036,9 +8036,9 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8051,7 +8051,7 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB172_1; ; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -8070,8 +8070,8 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8083,9 +8083,9 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 
%r15, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8098,7 +8098,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB173_1; ; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -8117,8 +8117,8 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8130,9 +8130,9 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -8145,7 +8145,7 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB174_1; ; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -8164,8 +8164,8 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8177,9 +8177,9 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8192,7 +8192,7 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB175_1; ; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.gpu; @@ -8211,8 +8211,8 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8224,9 +8224,9 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8239,7 +8239,7 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB176_1; ; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -8258,8 +8258,8 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, 
[seq_cst_seq_cst_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8271,9 +8271,9 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8286,7 +8286,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB177_1; ; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -8305,8 +8305,8 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8318,9 +8318,9 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 
%r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8333,7 +8333,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB178_1; ; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -8352,8 +8352,8 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -8365,9 +8365,9 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB179_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8380,7 +8380,7 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; -; SM90-NEXT: mov.u32 %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB179_1; ; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -8399,10 +8399,10 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8412,7 +8412,7 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8425,7 +8425,7 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; 
SM90-NEXT: @%p2 bra $L__BB180_1; ; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8443,10 +8443,10 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8456,7 +8456,7 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8469,7 +8469,7 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB181_1; ; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8487,10 +8487,10 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8500,7 +8500,7 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8513,7 +8513,7 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB182_1; ; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8531,10 +8531,10 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8544,7 +8544,7 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8557,7 +8557,7 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB183_1; ; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8575,10 +8575,10 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8588,7 +8588,7 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, 
i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8601,7 +8601,7 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB184_1; ; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8619,10 +8619,10 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8632,7 +8632,7 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 
@@ -8645,7 +8645,7 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB185_1; ; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8663,10 +8663,10 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8676,7 +8676,7 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8689,7 +8689,7 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra 
$L__BB186_1; ; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8707,10 +8707,10 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8720,7 +8720,7 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8733,7 +8733,7 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB187_1; ; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8751,10 +8751,10 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[monotonic_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8764,7 +8764,7 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8777,7 +8777,7 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB188_1; ; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8795,10 +8795,10 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8808,7 +8808,7 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8821,7 +8821,7 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB189_1; ; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8839,10 +8839,10 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8852,7 +8852,7 @@ define i16 
@monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8865,7 +8865,7 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB190_1; ; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8883,10 +8883,10 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8896,7 +8896,7 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB191_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8909,7 +8909,7 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB191_1; ; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -8927,10 +8927,10 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8940,7 +8940,7 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8953,7 +8953,7 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; 
SM90-NEXT: @%p2 bra $L__BB192_1; ; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -8972,10 +8972,10 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -8985,7 +8985,7 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -8998,7 +8998,7 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB193_1; ; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -9017,10 +9017,10 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: 
ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9030,7 +9030,7 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9043,7 +9043,7 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB194_1; ; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -9062,10 +9062,10 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: 
ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9075,7 +9075,7 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9088,7 +9088,7 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB195_1; ; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -9107,10 +9107,10 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9120,7 +9120,7 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9133,7 +9133,7 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB196_1; ; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -9152,10 +9152,10 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9165,7 +9165,7 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9178,7 +9178,7 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 
; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB197_1; ; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -9197,10 +9197,10 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9210,7 +9210,7 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9223,7 +9223,7 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB198_1; ; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -9242,10 +9242,10 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr 
addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9255,7 +9255,7 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9268,7 +9268,7 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB199_1; ; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -9287,10 +9287,10 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9300,7 +9300,7 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9313,7 +9313,7 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB200_1; ; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -9332,10 +9332,10 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9345,7 +9345,7 @@ define 
i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9358,7 +9358,7 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB201_1; ; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -9377,10 +9377,10 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9390,7 +9390,7 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB202_1: // 
%partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9403,7 +9403,7 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB202_1; ; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -9422,10 +9422,10 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -9435,7 +9435,7 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9448,7 +9448,7 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; 
SM90-NEXT: @%p2 bra $L__BB203_1; ; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -9467,10 +9467,10 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9481,7 +9481,7 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9494,7 +9494,7 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB204_1; ; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -9513,10 +9513,10 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 
%rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9527,7 +9527,7 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9540,7 +9540,7 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB205_1; ; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -9559,10 +9559,10 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[monotonic_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9573,7 +9573,7 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9586,7 +9586,7 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB206_1; ; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -9605,10 +9605,10 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9619,7 +9619,7 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9632,7 +9632,7 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB207_1; ; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -9651,10 +9651,10 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9665,7 +9665,7 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9678,7 +9678,7 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; 
; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB208_1; ; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -9697,10 +9697,10 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9711,7 +9711,7 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9724,7 +9724,7 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB209_1; ; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -9743,10 +9743,10 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9757,7 +9757,7 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9770,7 +9770,7 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB210_1; ; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -9789,10 +9789,10 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: 
fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9803,7 +9803,7 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9816,7 +9816,7 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB211_1; ; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -9835,10 +9835,10 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9849,7 +9849,7 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9862,7 +9862,7 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB212_1; ; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -9881,10 +9881,10 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9895,7 +9895,7 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9908,7 +9908,7 @@ define i16 
@monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB213_1; ; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -9927,10 +9927,10 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9941,7 +9941,7 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -9954,7 +9954,7 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB214_1; ; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.cluster; @@ -9973,10 +9973,10 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -9987,7 +9987,7 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10000,7 +10000,7 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB215_1; ; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -10019,10 +10019,10 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: 
ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10032,7 +10032,7 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10045,7 +10045,7 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB216_1; ; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -10064,10 +10064,10 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; 
SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10077,7 +10077,7 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10090,7 +10090,7 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB217_1; ; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -10109,10 +10109,10 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10122,7 +10122,7 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; 
; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10135,7 +10135,7 @@ define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB218_1; ; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -10154,10 +10154,10 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10167,7 +10167,7 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10180,7 +10180,7 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; 
SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB219_1; ; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -10199,10 +10199,10 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10212,7 +10212,7 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10225,7 +10225,7 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB220_1; ; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -10244,10 +10244,10 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10257,7 +10257,7 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10270,7 +10270,7 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB221_1; ; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -10289,10 +10289,10 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10302,7 +10302,7 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10315,7 +10315,7 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB222_1; ; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -10334,10 +10334,10 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10347,7 +10347,7 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 
%cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10360,7 +10360,7 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB223_1; ; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -10379,10 +10379,10 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10392,7 +10392,7 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10405,7 +10405,7 @@ 
define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB224_1; ; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -10424,10 +10424,10 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10437,7 +10437,7 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10450,7 +10450,7 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB225_1; ; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.cta; @@ -10469,10 +10469,10 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10482,7 +10482,7 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10495,7 +10495,7 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB226_1; ; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -10514,10 +10514,10 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10527,7 +10527,7 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10540,7 +10540,7 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB227_1; ; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -10559,10 +10559,10 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[acquire_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10572,7 +10572,7 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10585,7 +10585,7 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB228_1; ; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -10604,10 +10604,10 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10617,7 +10617,7 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; 
+; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10630,7 +10630,7 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB229_1; ; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -10649,10 +10649,10 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10662,7 +10662,7 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10675,7 +10675,7 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; 
SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB230_1; ; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -10694,10 +10694,10 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10707,7 +10707,7 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10720,7 +10720,7 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB231_1; ; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -10739,10 +10739,10 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10752,7 +10752,7 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10765,7 +10765,7 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB232_1; ; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -10784,10 +10784,10 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, 
[acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10797,7 +10797,7 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10810,7 +10810,7 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB233_1; ; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -10829,10 +10829,10 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10842,7 +10842,7 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 
%r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10855,7 +10855,7 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB234_1; ; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -10874,10 +10874,10 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10887,7 +10887,7 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10900,7 +10900,7 @@ define i16 
@acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB235_1; ; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -10919,10 +10919,10 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10932,7 +10932,7 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10945,7 +10945,7 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB236_1; ; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; 
@@ -10964,10 +10964,10 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -10977,7 +10977,7 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -10990,7 +10990,7 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB237_1; ; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -11009,10 +11009,10 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[acquire_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11022,7 +11022,7 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11035,7 +11035,7 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB238_1; ; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -11054,10 +11054,10 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: 
and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -11067,7 +11067,7 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11080,7 +11080,7 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB239_1; ; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -11099,10 +11099,10 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11113,7 +11113,7 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; 
SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11126,7 +11126,7 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB240_1; ; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -11145,10 +11145,10 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11159,7 +11159,7 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11172,7 +11172,7 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: 
@%p2 bra $L__BB241_1; ; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -11191,10 +11191,10 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11205,7 +11205,7 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11218,7 +11218,7 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB242_1; ; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -11237,10 +11237,10 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; 
SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11251,7 +11251,7 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11264,7 +11264,7 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB243_1; ; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -11283,10 +11283,10 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11297,7 +11297,7 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11310,7 +11310,7 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB244_1; ; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -11329,10 +11329,10 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11343,7 +11343,7 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11356,7 +11356,7 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB245_1; ; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -11375,10 +11375,10 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11389,7 +11389,7 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11402,7 +11402,7 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; 
SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB246_1; ; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -11421,10 +11421,10 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11435,7 +11435,7 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11448,7 +11448,7 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB247_1; ; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -11467,10 +11467,10 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11481,7 +11481,7 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11494,7 +11494,7 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB248_1; ; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -11513,10 +11513,10 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, 
[acquire_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11527,7 +11527,7 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11540,7 +11540,7 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB249_1; ; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -11559,10 +11559,10 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11573,7 +11573,7 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 
%r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11586,7 +11586,7 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB250_1; ; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -11605,10 +11605,10 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11619,7 +11619,7 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11632,7 +11632,7 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr 
addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB251_1; ; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -11651,10 +11651,10 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11665,7 +11665,7 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11678,7 +11678,7 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB252_1; ; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -11696,10 +11696,10 
@@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11710,7 +11710,7 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11723,7 +11723,7 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB253_1; ; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -11741,10 +11741,10 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[release_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11755,7 +11755,7 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11768,7 +11768,7 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB254_1; ; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -11786,10 +11786,10 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, 
-4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11800,7 +11800,7 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11813,7 +11813,7 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB255_1; ; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -11831,10 +11831,10 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11845,7 +11845,7 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, 
[%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11858,7 +11858,7 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB256_1; ; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -11876,10 +11876,10 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11890,7 +11890,7 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11903,7 +11903,7 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: 
setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB257_1; ; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -11921,10 +11921,10 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11935,7 +11935,7 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11948,7 +11948,7 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB258_1; ; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -11966,10 +11966,10 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, 
i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -11980,7 +11980,7 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -11993,7 +11993,7 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB259_1; ; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -12011,10 +12011,10 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12025,7 +12025,7 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12038,7 +12038,7 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB260_1; ; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -12056,10 +12056,10 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12070,7 
+12070,7 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12083,7 +12083,7 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB261_1; ; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -12101,10 +12101,10 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12115,7 +12115,7 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; 
; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12128,7 +12128,7 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB262_1; ; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -12146,10 +12146,10 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12160,7 +12160,7 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12173,7 +12173,7 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 
%r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB263_1; ; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -12191,10 +12191,10 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12205,7 +12205,7 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12218,7 +12218,7 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB264_1; ; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -12237,10 +12237,10 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[release_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12251,7 +12251,7 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12264,7 +12264,7 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB265_1; ; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -12283,10 +12283,10 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, 
[release_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12297,7 +12297,7 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12310,7 +12310,7 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB266_1; ; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -12329,10 +12329,10 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12343,7 +12343,7 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: 
shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12356,7 +12356,7 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB267_1; ; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -12375,10 +12375,10 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12389,7 +12389,7 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12402,7 +12402,7 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; 
SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB268_1; ; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -12421,10 +12421,10 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12435,7 +12435,7 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12448,7 +12448,7 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB269_1; ; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -12467,10 +12467,10 @@ define i16 
@release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12481,7 +12481,7 @@ define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12494,7 +12494,7 @@ define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB270_1; ; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -12513,10 +12513,10 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[release_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12527,7 +12527,7 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12540,7 +12540,7 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB271_1; ; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -12559,10 +12559,10 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; 
SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12573,7 +12573,7 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12586,7 +12586,7 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB272_1; ; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -12605,10 +12605,10 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12619,7 +12619,7 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; 
SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12632,7 +12632,7 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB273_1; ; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -12651,10 +12651,10 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12665,7 +12665,7 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12678,7 +12678,7 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 
%r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB274_1; ; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -12697,10 +12697,10 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12711,7 +12711,7 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12724,7 +12724,7 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB275_1; ; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -12743,10 +12743,10 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[release_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12757,7 +12757,7 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12770,7 +12770,7 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB276_1; ; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -12789,10 +12789,10 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[release_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12803,7 +12803,7 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12816,7 +12816,7 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB277_1; ; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -12835,10 +12835,10 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12849,7 +12849,7 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12862,7 +12862,7 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB278_1; ; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -12881,10 +12881,10 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12895,7 +12895,7 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12908,7 +12908,7 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; 
SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB279_1; ; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -12927,10 +12927,10 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12941,7 +12941,7 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -12954,7 +12954,7 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB280_1; ; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -12973,10 +12973,10 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -12987,7 +12987,7 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13000,7 +13000,7 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB281_1; ; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -13019,10 +13019,10 @@ define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: 
ld.param.u16 %r9, [release_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13033,7 +13033,7 @@ define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13046,7 +13046,7 @@ define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB282_1; ; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -13065,10 +13065,10 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13079,7 +13079,7 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 
%r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13092,7 +13092,7 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB283_1; ; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -13111,10 +13111,10 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13125,7 +13125,7 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13138,7 +13138,7 @@ define i16 @release_seq_cst_i16_shared_sys(ptr 
addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB284_1; ; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -13157,10 +13157,10 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13171,7 +13171,7 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13184,7 +13184,7 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB285_1; ; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -13203,10 +13203,10 @@ define i16 
@release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13217,7 +13217,7 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13230,7 +13230,7 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB286_1; ; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -13249,10 +13249,10 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[release_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13263,7 +13263,7 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13276,7 +13276,7 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB287_1; ; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -13295,10 +13295,10 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; 
; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13309,7 +13309,7 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13322,7 +13322,7 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB288_1; ; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -13341,10 +13341,10 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13355,7 +13355,7 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB289_1: 
// %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13368,7 +13368,7 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB289_1; ; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -13387,10 +13387,10 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13401,7 +13401,7 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13414,7 +13414,7 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; 
SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB290_1; ; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -13433,10 +13433,10 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13447,7 +13447,7 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13460,7 +13460,7 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB291_1; ; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -13479,10 +13479,10 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[acq_rel_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13493,7 +13493,7 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13506,7 +13506,7 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB292_1; ; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -13525,10 +13525,10 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, 
[acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13539,7 +13539,7 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13552,7 +13552,7 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB293_1; ; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -13571,10 +13571,10 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13585,7 +13585,7 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13598,7 +13598,7 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB294_1; ; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -13617,10 +13617,10 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13631,7 +13631,7 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13644,7 +13644,7 @@ define 
i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB295_1; ; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -13663,10 +13663,10 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13677,7 +13677,7 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13690,7 +13690,7 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB296_1; ; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.sys; @@ -13709,10 +13709,10 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13723,7 +13723,7 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13736,7 +13736,7 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB297_1; ; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -13755,10 +13755,10 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: 
ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13769,7 +13769,7 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13782,7 +13782,7 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB298_1; ; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -13801,10 +13801,10 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13815,7 +13815,7 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13828,7 +13828,7 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB299_1; ; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -13847,10 +13847,10 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13861,7 +13861,7 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; 
SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13874,7 +13874,7 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB300_1; ; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -13893,10 +13893,10 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13907,7 +13907,7 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13920,7 +13920,7 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; 
SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB301_1; ; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -13939,10 +13939,10 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13953,7 +13953,7 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -13966,7 +13966,7 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB302_1; ; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -13985,10 +13985,10 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[acq_rel_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -13999,7 +13999,7 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14012,7 +14012,7 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB303_1; ; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -14031,10 +14031,10 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; +; SM90-NEXT: 
ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14045,7 +14045,7 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14058,7 +14058,7 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB304_1; ; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -14077,10 +14077,10 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14091,7 +14091,7 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14104,7 +14104,7 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB305_1; ; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -14123,10 +14123,10 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14137,7 +14137,7 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14150,7 +14150,7 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in 
Loop: Header=BB306_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB306_1; ; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -14169,10 +14169,10 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14183,7 +14183,7 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14196,7 +14196,7 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB307_1; ; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -14215,10 +14215,10 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) 
%addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14229,7 +14229,7 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14242,7 +14242,7 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB308_1; ; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -14261,10 +14261,10 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14275,7 +14275,7 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14288,7 +14288,7 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB309_1; ; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -14307,10 +14307,10 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14321,7 +14321,7 
@@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14334,7 +14334,7 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB310_1; ; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -14353,10 +14353,10 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14367,7 +14367,7 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14380,7 +14380,7 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB311_1; ; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -14399,10 +14399,10 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14413,7 +14413,7 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14426,7 +14426,7 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB312_1; ; SM90-NEXT: $L__BB312_3: 
// %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -14445,10 +14445,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14459,7 +14459,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14472,7 +14472,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB313_1; ; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -14491,10 +14491,10 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: 
ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14505,7 +14505,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14518,7 +14518,7 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB314_1; ; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -14537,10 +14537,10 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14551,7 +14551,7 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14564,7 +14564,7 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB315_1; ; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -14583,10 +14583,10 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14597,7 +14597,7 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; 
SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14610,7 +14610,7 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB316_1; ; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -14629,10 +14629,10 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14643,7 +14643,7 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14656,7 +14656,7 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 
%r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB317_1; ; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -14675,10 +14675,10 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14689,7 +14689,7 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14702,7 +14702,7 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB318_1; ; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -14721,10 +14721,10 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14735,7 +14735,7 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14748,7 +14748,7 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB319_1; ; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -14767,10 +14767,10 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: 
ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14781,7 +14781,7 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14794,7 +14794,7 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB320_1; ; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -14813,10 +14813,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14827,7 +14827,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; 
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14840,7 +14840,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB321_1; ; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -14859,10 +14859,10 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14873,7 +14873,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14886,7 +14886,7 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: 
Header=BB322_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB322_1; ; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -14905,10 +14905,10 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14919,7 +14919,7 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14932,7 +14932,7 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB323_1; ; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -14951,10 +14951,10 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 
%new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -14965,7 +14965,7 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -14978,7 +14978,7 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB324_1; ; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -14997,10 +14997,10 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; 
; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15011,7 +15011,7 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15024,7 +15024,7 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB325_1; ; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -15043,10 +15043,10 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15057,7 +15057,7 @@ define i16 
@seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15070,7 +15070,7 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB326_1; ; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -15089,10 +15089,10 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15103,7 +15103,7 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -15116,7 +15116,7 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB327_1; ; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -15135,10 +15135,10 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15149,7 +15149,7 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15162,7 +15162,7 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB328_1; ; SM90-NEXT: $L__BB328_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -15181,10 +15181,10 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15195,7 +15195,7 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15208,7 +15208,7 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB329_1; ; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -15227,10 +15227,10 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15241,7 +15241,7 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15254,7 +15254,7 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB330_1; ; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -15273,10 +15273,10 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[seq_cst_monotonic_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15287,7 +15287,7 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15300,7 +15300,7 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB331_1; ; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -15319,10 +15319,10 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15333,7 +15333,7 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15346,7 +15346,7 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB332_1; ; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -15365,10 +15365,10 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15379,7 +15379,7 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15392,7 +15392,7 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 
; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB333_1; ; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -15411,10 +15411,10 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15425,7 +15425,7 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15438,7 +15438,7 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB334_1; ; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -15457,10 +15457,10 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr 
addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15471,7 +15471,7 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15484,7 +15484,7 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB335_1; ; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -15503,10 +15503,10 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15517,7 +15517,7 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15530,7 +15530,7 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB336_1; ; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -15549,10 +15549,10 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15563,7 +15563,7 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr 
%addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15576,7 +15576,7 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB337_1; ; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -15595,10 +15595,10 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15609,7 +15609,7 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15622,7 +15622,7 
@@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB338_1; ; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -15641,10 +15641,10 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15655,7 +15655,7 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15668,7 +15668,7 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB339_1; ; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ 
-15687,10 +15687,10 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15701,7 +15701,7 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15714,7 +15714,7 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB340_1; ; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -15733,10 +15733,10 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15747,7 +15747,7 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15760,7 +15760,7 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB341_1; ; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -15779,10 +15779,10 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 
%r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15793,7 +15793,7 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15806,7 +15806,7 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB342_1; ; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -15825,10 +15825,10 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15839,7 +15839,7 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, 
%r2; ; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15852,7 +15852,7 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB343_1; ; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -15871,10 +15871,10 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15885,7 +15885,7 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15898,7 +15898,7 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: 
mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB344_1; ; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -15917,10 +15917,10 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15931,7 +15931,7 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15944,7 +15944,7 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB345_1; ; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -15963,10 +15963,10 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[seq_cst_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -15977,7 +15977,7 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -15990,7 +15990,7 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB346_1; ; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -16009,10 +16009,10 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, 
[seq_cst_acquire_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16023,7 +16023,7 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16036,7 +16036,7 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB347_1; ; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -16055,10 +16055,10 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16069,7 +16069,7 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 
%r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16082,7 +16082,7 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB348_1; ; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -16101,10 +16101,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16115,7 +16115,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16128,7 +16128,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB349_1 
Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB349_1; ; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -16147,10 +16147,10 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16161,7 +16161,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16174,7 +16174,7 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB350_1; ; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -16193,10 +16193,10 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, 
i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16207,7 +16207,7 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16220,7 +16220,7 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB351_1; ; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -16239,10 +16239,10 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; ; SM90-NEXT: 
fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16253,7 +16253,7 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16266,7 +16266,7 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB352_1; ; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -16285,10 +16285,10 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16299,7 +16299,7 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16312,7 +16312,7 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB353_1; ; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -16331,10 +16331,10 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16345,7 +16345,7 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16358,7 +16358,7 @@ 
define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB354_1; ; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -16377,10 +16377,10 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16391,7 +16391,7 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16404,7 +16404,7 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB355_1; ; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.gpu; @@ -16423,10 +16423,10 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16437,7 +16437,7 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16450,7 +16450,7 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB356_1; ; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -16469,10 +16469,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, 
[seq_cst_seq_cst_i16_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16483,7 +16483,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16496,7 +16496,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB357_1; ; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -16515,10 +16515,10 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 
%r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16529,7 +16529,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16542,7 +16542,7 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB358_1; ; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -16561,10 +16561,10 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -16575,7 +16575,7 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, 
%r2; ; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -16588,7 +16588,7 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; ; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; -; SM90-NEXT: mov.u32 %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB359_1; ; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -16605,9 +16605,9 @@ define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16622,9 +16622,9 @@ define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], 
%r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16639,9 +16639,9 @@ define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16656,9 +16656,9 @@ define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16673,9 +16673,9 @@ define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: 
ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16690,9 +16690,9 @@ define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16707,9 +16707,9 @@ define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM90-NEXT: ret; @@ -16724,9 +16724,9 @@ define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16741,9 +16741,9 @@ define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16758,9 +16758,9 @@ define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16775,9 +16775,9 @@ define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16792,9 +16792,9 @@ define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16809,9 +16809,9 @@ define i32 
@monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16826,9 +16826,9 @@ define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16843,9 +16843,9 @@ define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: 
ld.param.b32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16860,9 +16860,9 @@ define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16877,9 +16877,9 @@ define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16894,9 +16894,9 @@ define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16911,9 +16911,9 @@ define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16928,9 +16928,9 @@ define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; 
; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16945,9 +16945,9 @@ define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16962,9 +16962,9 @@ define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16979,9 +16979,9 @@ define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; -; 
SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -16996,9 +16996,9 @@ define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17013,10 +17013,10 @@ define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM90-NEXT: ret; @@ -17031,10 +17031,10 @@ define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17049,10 +17049,10 @@ define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17067,10 +17067,10 @@ define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: 
ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17085,10 +17085,10 @@ define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17103,10 +17103,10 @@ define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM90-NEXT: ret; @@ -17121,10 +17121,10 @@ define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17139,10 +17139,10 @@ define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17157,10 +17157,10 @@ define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; 
SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17175,10 +17175,10 @@ define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17193,10 +17193,10 @@ define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17211,10 +17211,10 @@ define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17229,9 +17229,9 @@ define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17246,9 +17246,9 @@ define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, 
[acquire_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17263,9 +17263,9 @@ define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17280,9 +17280,9 @@ define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17297,9 +17297,9 @@ define i32 
@acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17314,9 +17314,9 @@ define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17331,9 +17331,9 @@ define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; +; 
SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17348,9 +17348,9 @@ define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17365,9 +17365,9 @@ define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17382,9 +17382,9 @@ define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17399,9 +17399,9 @@ define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17416,9 +17416,9 @@ define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[acquire_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17433,9 +17433,9 @@ define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17450,9 +17450,9 @@ define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17467,9 +17467,9 @@ define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[acquire_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17484,9 +17484,9 @@ define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17501,9 +17501,9 @@ define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ 
-17518,9 +17518,9 @@ define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17535,9 +17535,9 @@ define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17552,9 +17552,9 @@ define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; 
+; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17569,9 +17569,9 @@ define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17586,9 +17586,9 @@ define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17603,9 +17603,9 @@ define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[acquire_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17620,9 +17620,9 @@ define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17637,10 +17637,10 @@ define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: 
atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17655,10 +17655,10 @@ define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17673,10 +17673,10 @@ define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17691,10 +17691,10 @@ define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17709,10 +17709,10 @@ define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17727,10 +17727,10 @@ define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; 
SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17745,10 +17745,10 @@ define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17763,10 +17763,10 @@ define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17781,10 +17781,10 @@ define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: 
fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17799,10 +17799,10 @@ define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17817,10 +17817,10 @@ define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 
[func_retval0], %r2; ; SM90-NEXT: ret; @@ -17835,10 +17835,10 @@ define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17853,9 +17853,9 @@ define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17870,9 +17870,9 @@ define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17887,9 +17887,9 @@ define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17904,9 +17904,9 @@ define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17921,9 +17921,9 @@ define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, 
i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17938,9 +17938,9 @@ define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17955,9 +17955,9 @@ define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cluster_param_1]; 
+; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17972,9 +17972,9 @@ define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -17989,9 +17989,9 @@ define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18006,9 +18006,9 @@ define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; 
SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18023,9 +18023,9 @@ define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18040,9 +18040,9 @@ define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, 
%r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18057,9 +18057,9 @@ define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18074,9 +18074,9 @@ define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18091,9 +18091,9 @@ define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd1, [release_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18108,9 +18108,9 @@ define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18125,9 +18125,9 @@ define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18142,9 +18142,9 @@ define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18159,9 +18159,9 @@ define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18176,9 +18176,9 @@ define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[release_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18193,9 +18193,9 @@ define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18210,9 +18210,9 @@ define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18227,9 +18227,9 @@ define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[release_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18244,9 +18244,9 @@ define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18261,10 +18261,10 @@ define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 
[func_retval0], %r2; ; SM90-NEXT: ret; @@ -18279,10 +18279,10 @@ define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18297,10 +18297,10 @@ define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18315,10 +18315,10 @@ define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: 
ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18333,10 +18333,10 @@ define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18351,10 +18351,10 @@ define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18369,10 +18369,10 @@ 
define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18387,10 +18387,10 @@ define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18405,10 +18405,10 @@ define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; 
SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18423,10 +18423,10 @@ define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18441,10 +18441,10 @@ define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18459,10 +18459,10 @@ define i32 
@release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18477,9 +18477,9 @@ define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18494,9 +18494,9 @@ define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: 
ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18511,9 +18511,9 @@ define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18528,9 +18528,9 @@ define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18545,9 +18545,9 @@ define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18562,9 +18562,9 @@ define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18579,9 +18579,9 @@ define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18596,9 +18596,9 @@ define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18613,9 +18613,9 @@ define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18630,9 +18630,9 @@ define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[acq_rel_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18647,9 +18647,9 @@ define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18664,9 +18664,9 @@ define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 
[func_retval0], %r2; ; SM90-NEXT: ret; @@ -18681,9 +18681,9 @@ define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18698,9 +18698,9 @@ define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18715,9 +18715,9 @@ define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acq_rel_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18732,9 +18732,9 @@ define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18749,9 +18749,9 @@ define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18766,9 +18766,9 @@ define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18783,9 +18783,9 @@ define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18800,9 +18800,9 @@ define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: 
atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18817,9 +18817,9 @@ define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18834,9 +18834,9 @@ define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18851,9 +18851,9 @@ define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, 
[acq_rel_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18868,9 +18868,9 @@ define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18885,10 +18885,10 @@ define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18903,10 +18903,10 @@ define i32 
@acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18921,10 +18921,10 @@ define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18939,10 +18939,10 @@ define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: 
ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18957,10 +18957,10 @@ define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18975,10 +18975,10 @@ define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -18993,10 +18993,10 @@ define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 
%cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19011,10 +19011,10 @@ define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19029,10 +19029,10 @@ define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; +; 
SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19047,10 +19047,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19065,10 +19065,10 @@ define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19083,10 +19083,10 @@ define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg 
.b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19101,10 +19101,10 @@ define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19119,10 +19119,10 @@ define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, 
[seq_cst_monotonic_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19137,10 +19137,10 @@ define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19155,10 +19155,10 @@ define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19173,10 +19173,10 @@ define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; 
; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19191,10 +19191,10 @@ define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19209,10 +19209,10 @@ define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; +; SM90-NEXT: 
ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19227,10 +19227,10 @@ define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19245,10 +19245,10 @@ define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19263,10 +19263,10 @@ define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 
%rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19281,10 +19281,10 @@ define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cm ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19299,10 +19299,10 @@ define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; +; 
SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19317,10 +19317,10 @@ define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19335,10 +19335,10 @@ define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19353,10 +19353,10 @@ define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: 
// %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19371,10 +19371,10 @@ define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19389,10 +19389,10 @@ define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; +; 
SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19407,10 +19407,10 @@ define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19425,10 +19425,10 @@ define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19443,10 +19443,10 @@ define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [seq_cst_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19461,10 +19461,10 @@ define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19479,10 +19479,10 @@ define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[seq_cst_acquire_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19497,10 +19497,10 @@ define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19515,10 +19515,10 @@ define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19533,10 +19533,10 @@ define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i32_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19551,10 +19551,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19569,10 +19569,10 @@ define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[seq_cst_seq_cst_i32_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19587,10 +19587,10 @@ define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19605,10 +19605,10 @@ define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19623,10 +19623,10 @@ define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: 
ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19641,10 +19641,10 @@ define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19659,10 +19659,10 @@ define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; ; SM90-NEXT: 
atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19677,10 +19677,10 @@ define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19695,10 +19695,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19713,10 +19713,10 @@ define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_seq_cst_i32_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19731,10 +19731,10 @@ define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -19748,9 +19748,9 @@ define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.cas.b64 
%rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19764,9 +19764,9 @@ define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19780,9 +19780,9 @@ define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19796,9 +19796,9 @@ define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19812,9 +19812,9 @@ define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19828,9 +19828,9 @@ define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; 
; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19844,9 +19844,9 @@ define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19860,9 +19860,9 @@ define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19876,9 +19876,9 @@ define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; 
SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19892,9 +19892,9 @@ define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19908,9 +19908,9 @@ define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19924,9 +19924,9 @@ define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19940,9 +19940,9 @@ define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19956,9 +19956,9 @@ define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19972,9 +19972,9 @@ define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -19988,9 +19988,9 @@ define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20004,9 +20004,9 
@@ define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20020,9 +20020,9 @@ define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20036,9 +20036,9 @@ define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20052,9 +20052,9 @@ define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20068,9 +20068,9 @@ define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20084,9 +20084,9 @@ define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i 
; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20100,9 +20100,9 @@ define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20116,9 +20116,9 @@ define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20132,10 +20132,10 @@ define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20149,10 +20149,10 @@ define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20166,10 +20166,10 @@ define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20183,10 +20183,10 @@ define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20200,10 +20200,10 @@ define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; 
+; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20217,10 +20217,10 @@ define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20234,10 +20234,10 @@ define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20251,10 +20251,10 @@ define i64 @monotonic_seq_cst_i64_global_gpu(ptr 
addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20268,10 +20268,10 @@ define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20285,10 +20285,10 @@ define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[monotonic_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20302,10 +20302,10 @@ define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20319,10 +20319,10 @@ define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20336,9 +20336,9 @@ define 
i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20352,9 +20352,9 @@ define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20368,9 +20368,9 @@ define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20384,9 +20384,9 @@ define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20400,9 +20400,9 @@ define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20416,9 +20416,9 @@ define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; 
SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20432,9 +20432,9 @@ define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20448,9 +20448,9 @@ define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20464,9 +20464,9 @@ define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20480,9 +20480,9 @@ define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20496,9 +20496,9 @@ define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20512,9 +20512,9 @@ define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20528,9 +20528,9 @@ define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, 
[acquire_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20544,9 +20544,9 @@ define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20560,9 +20560,9 @@ define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20576,9 +20576,9 @@ define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20592,9 +20592,9 @@ define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20608,9 +20608,9 @@ define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ 
-20624,9 +20624,9 @@ define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20640,9 +20640,9 @@ define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20656,9 +20656,9 @@ define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20672,9 +20672,9 @@ define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20688,9 +20688,9 @@ define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20704,9 +20704,9 @@ define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg 
.b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20720,10 +20720,10 @@ define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20737,10 +20737,10 @@ define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; 
+; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20754,10 +20754,10 @@ define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20771,10 +20771,10 @@ define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20788,10 +20788,10 @@ define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20805,10 +20805,10 @@ define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20822,10 +20822,10 @@ define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; +; 
SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20839,10 +20839,10 @@ define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20856,10 +20856,10 @@ define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20873,10 +20873,10 @@ define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [acquire_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20890,10 +20890,10 @@ define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20907,10 +20907,10 @@ define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: 
ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20924,9 +20924,9 @@ define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20940,9 +20940,9 @@ define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20956,9 +20956,9 @@ define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; -; 
SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20972,9 +20972,9 @@ define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -20988,9 +20988,9 @@ define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21004,9 +21004,9 @@ define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21020,9 +21020,9 @@ define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21036,9 +21036,9 @@ define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: 
ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21052,9 +21052,9 @@ define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21068,9 +21068,9 @@ define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21084,9 +21084,9 @@ 
define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21100,9 +21100,9 @@ define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21116,9 +21116,9 @@ define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[release_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21132,9 +21132,9 @@ define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21148,9 +21148,9 @@ define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21164,9 +21164,9 @@ define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; 
; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21180,9 +21180,9 @@ define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21196,9 +21196,9 @@ define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, 
[release_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21212,9 +21212,9 @@ define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21228,9 +21228,9 @@ define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21244,9 +21244,9 @@ define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: 
ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21260,9 +21260,9 @@ define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21276,9 +21276,9 @@ define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21292,9 +21292,9 @@ define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21308,10 +21308,10 @@ define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21325,10 +21325,10 @@ define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21342,10 +21342,10 @@ define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21359,10 +21359,10 @@ define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: 
ret; @@ -21376,10 +21376,10 @@ define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21393,10 +21393,10 @@ define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21410,10 +21410,10 @@ define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21427,10 +21427,10 @@ define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21444,10 +21444,10 @@ define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: 
ret; @@ -21461,10 +21461,10 @@ define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21478,10 +21478,10 @@ define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21495,10 +21495,10 @@ define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, 
[release_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21512,9 +21512,9 @@ define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21528,9 +21528,9 @@ define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21544,9 +21544,9 @@ define i64 
@acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21560,9 +21560,9 @@ define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21576,9 +21576,9 @@ define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21592,9 +21592,9 @@ define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21608,9 +21608,9 @@ define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21624,9 +21624,9 @@ define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, 
i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21640,9 +21640,9 @@ define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21656,9 +21656,9 @@ define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21672,9 +21672,9 @@ define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21688,9 +21688,9 @@ define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21704,9 +21704,9 @@ define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21720,9 +21720,9 @@ define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21736,9 +21736,9 @@ define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; ; 
SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21752,9 +21752,9 @@ define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21768,9 +21768,9 @@ define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21784,9 +21784,9 @@ define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 
%rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21800,9 +21800,9 @@ define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21816,9 +21816,9 @@ define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21832,9 +21832,9 @@ define i64 
@acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21848,9 +21848,9 @@ define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21864,9 +21864,9 @@ define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21880,9 +21880,9 @@ define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21896,10 +21896,10 @@ define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21913,10 +21913,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21930,10 +21930,10 @@ define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21947,10 +21947,10 @@ define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21964,10 +21964,10 @@ define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21981,10 +21981,10 @@ define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -21998,10 +21998,10 @@ define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22015,10 +22015,10 @@ define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22032,10 +22032,10 @@ define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22049,10 +22049,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22066,10 +22066,10 @@ define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22083,10 +22083,10 @@ define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 
%rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22100,10 +22100,10 @@ define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22117,10 +22117,10 @@ define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 
%rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22134,10 +22134,10 @@ define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22151,10 +22151,10 @@ define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22168,10 +22168,10 @@ define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i ; 
SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22185,10 +22185,10 @@ define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22202,10 +22202,10 @@ define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[seq_cst_monotonic_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22219,10 +22219,10 @@ define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22236,10 +22236,10 @@ define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22253,10 +22253,10 @@ define i64 
@seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22270,10 +22270,10 @@ define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cm ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22287,10 +22287,10 @@ define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22304,10 +22304,10 @@ define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22321,10 +22321,10 @@ define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22338,10 
+22338,10 @@ define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22355,10 +22355,10 @@ define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22372,10 +22372,10 @@ define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22389,10 +22389,10 @@ define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22406,10 +22406,10 @@ define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM90-NEXT: ret; @@ -22423,10 +22423,10 @@ define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22440,10 +22440,10 @@ define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22457,10 +22457,10 @@ define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22474,10 +22474,10 @@ define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22491,10 +22491,10 @@ define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM90-NEXT: ret; @@ -22508,10 +22508,10 @@ define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22525,10 +22525,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22542,10 +22542,10 @@ define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22559,10 +22559,10 @@ define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22576,10 +22576,10 @@ define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ 
-22593,10 +22593,10 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22610,10 +22610,10 @@ define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22627,10 +22627,10 @@ define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, 
[seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22644,10 +22644,10 @@ define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; ; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22661,10 +22661,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22678,10 
+22678,10 @@ define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; ; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -22695,10 +22695,10 @@ define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; ; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index f5c22664394b5..bff37d4cbba87 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -18,8 +18,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, 
[relaxed_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; ; SM30-NEXT: and.b32 %r10, %r9, 3; @@ -30,9 +30,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -59,8 +59,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -71,9 +71,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -144,8 +144,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; 
; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; ; SM30-NEXT: and.b32 %r10, %r9, 3; @@ -156,9 +156,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -186,8 +186,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -198,9 +198,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -273,8 +273,8 @@ define i8 
@release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM30-NEXT: membar.sys; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; @@ -286,9 +286,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -315,8 +315,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -328,9 +328,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // 
=>This Inner Loop Header: Depth=1 @@ -402,8 +402,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM30-NEXT: membar.sys; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; @@ -415,9 +415,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -445,8 +445,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -458,9 +458,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; 
SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -534,8 +534,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM30-NEXT: membar.sys; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; @@ -547,9 +547,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -577,8 +577,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -590,9 +590,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: 
ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -667,10 +667,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -680,7 +680,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -707,10 +707,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -720,7 +720,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -790,10 +790,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -803,7 +803,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -831,10 +831,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -844,7 +844,7 @@ 
define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -916,10 +916,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; @@ -930,7 +930,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -957,10 +957,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; 
; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -971,7 +971,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1042,10 +1042,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; @@ -1056,7 +1056,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1084,10 +1084,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, 
[acq_rel_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -1098,7 +1098,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1172,10 +1172,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; @@ -1186,7 +1186,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1214,10 +1214,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -1228,7 +1228,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1300,9 +1300,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1313,9 +1313,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1342,9 +1342,9 @@ define i32 
@acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1355,9 +1355,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1384,9 +1384,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1397,9 +1397,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1426,9 +1426,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1439,9 +1439,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; ; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1468,10 +1468,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM30-NEXT: ld.param.b64 %rd1, 
[seq_cst_sys_i32_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM30-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1482,10 +1482,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1514,9 +1514,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1526,9 +1526,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, 
[relaxed_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; ; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1553,9 +1553,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1565,9 +1565,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1592,9 +1592,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; 
+; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1604,9 +1604,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; ; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1631,9 +1631,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1643,9 +1643,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[release_sys_i64_param_2]; ; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1670,10 +1670,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1683,10 +1683,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; ; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; From 3722e80a46a7b021a23e497f4f01032964795269 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 1 Jul 2025 21:44:02 +0000 Subject: [PATCH 16/26] clang-format --- llvm/include/llvm/CodeGen/TargetLowering.h | 12 ++++----- llvm/lib/Target/ARM/ARMISelLowering.h | 8 +++--- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 8 +++--- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 2 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 27 +++++++++---------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 10 +++---- 6 files 
changed, 32 insertions(+), 35 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index ad8299ffd41ec..9c3cede359c15 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2325,13 +2325,13 @@ class LLVM_ABI TargetLoweringBase { /// standard ABI uses a fence before a seq_cst load instead of after a /// seq_cst store). /// @{ - virtual Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const; + virtual Instruction *emitLeadingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const; - virtual Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const; + virtual Instruction *emitTrailingFence(IRBuilderBase &Builder, + Instruction *Inst, + AtomicOrdering Ord) const; /// @} // Emits code that executes when the comparison result in the ll/sc diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 8ee009c7b2e39..604910e04d4cc 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -674,10 +674,10 @@ class VectorType; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; - Instruction *emitLeadingFence( - IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override; - Instruction *emitTrailingFence( - IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; unsigned getMaxSupportedInterleaveFactor() const override; diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 0a56404e6862f..af134f079ee91 
100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -269,7 +269,7 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, } void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, - raw_ostream &O, StringRef Modifier) { + raw_ostream &O, StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); if (Modifier == "sem") { @@ -321,9 +321,9 @@ void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, O << ".gpu"; return; } - report_fatal_error( - formatv("NVPTX AtomicCode Printer does not support \"{}\" scope modifier.", - ScopeToString(S))); + report_fatal_error(formatv( + "NVPTX AtomicCode Printer does not support \"{}\" scope modifier.", + ScopeToString(S))); } else if (Modifier == "addsp") { auto A = NVPTX::AddressSpace(Imm); switch (A) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 9e879c78a6906..c3ff3469150e4 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -41,7 +41,7 @@ class NVPTXInstPrinter : public MCInstPrinter { void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O, - StringRef Modifier = {}); + StringRef Modifier = {}); void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 5cf78454894b0..519fced023e12 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -304,7 +304,6 @@ void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) { } } - bool 
NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { unsigned IID = N->getConstantOperandVal(1); switch (IID) { @@ -525,19 +524,19 @@ unsigned int NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { return NVPTX::Ordering::NotAtomic; auto Ordering = N->getMergedOrdering(); switch (Ordering) { - case AtomicOrdering::NotAtomic: - case AtomicOrdering::Unordered: - return NVPTX::Ordering::NotAtomic; - case AtomicOrdering::Monotonic: - return NVPTX::Ordering::Relaxed; - case AtomicOrdering::Acquire: - return NVPTX::Ordering::Acquire; - case AtomicOrdering::Release: - return NVPTX::Ordering::Release; - case AtomicOrdering::AcquireRelease: - return NVPTX::Ordering::AcquireRelease; - case AtomicOrdering::SequentiallyConsistent: - return NVPTX::Ordering::SequentiallyConsistent; + case AtomicOrdering::NotAtomic: + case AtomicOrdering::Unordered: + return NVPTX::Ordering::NotAtomic; + case AtomicOrdering::Monotonic: + return NVPTX::Ordering::Relaxed; + case AtomicOrdering::Acquire: + return NVPTX::Ordering::Acquire; + case AtomicOrdering::Release: + return NVPTX::Ordering::Release; + case AtomicOrdering::AcquireRelease: + return NVPTX::Ordering::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return NVPTX::Ordering::SequentiallyConsistent; } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 08993c454e201..a1b283e35074a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -232,12 +232,10 @@ class RISCVTargetLowering : public TargetLowering { // than this hook due to limitations in the interface here. 
bool shouldInsertFencesForAtomic(const Instruction *I) const override; - Instruction * - emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; - Instruction * - emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, - AtomicOrdering Ord) const override; + Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; From dd6cb8c92929fe754ae2614a23c397d6d81d4e18 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 1 Jul 2025 21:58:28 +0000 Subject: [PATCH 17/26] No changes necessary to AtomicExpandPass --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 0e24b3d65ee2e..3f3d5dc90711f 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -314,7 +314,6 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; - SyncScope::ID SSID = SyncScope::System; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); @@ -337,18 +336,13 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { // expandAtomicCmpXchg in that case. FenceOrdering = CASI->getMergedOrdering(); auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI); - SSID = CASI->getSyncScopeID(); CASI->setSuccessOrdering(CASOrdering); CASI->setFailureOrdering(CASOrdering); - // If CAS ordering is monotonic, then the operation will - // take default scope. 
Otherwise, it will retain its scope - if (CASOrdering != AtomicOrdering::Monotonic) - CASI->setSyncScopeID(SSID); } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID); + MadeChange |= bracketInstWithFences(I, FenceOrdering); } } else if (I->hasAtomicStore() && TLI->shouldInsertTrailingFenceForAtomicStore(I)) { @@ -449,8 +443,7 @@ PreservedAnalyses AtomicExpandPass::run(Function &F, } bool AtomicExpandImpl::bracketInstWithFences(Instruction *I, - AtomicOrdering Order, - SyncScope::ID SSID) { + AtomicOrdering Order) { ReplacementIRBuilder Builder(I, *DL); auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); From 91c38a5c8a067aefc7be7dc231667485c87d6cba Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 1 Jul 2025 22:05:18 +0000 Subject: [PATCH 18/26] formatting --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 3b3d699099b06..be2f73224816f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2029,7 +2029,7 @@ multiclass ST_VEC { (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, O:$src5, O:$src6, O:$src7, O:$src8, - AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth " "\t[$addr], " From 9ff327931a9103c1ee077623b5ee3ebca3bb078a Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 1 Jul 2025 22:10:42 +0000 Subject: [PATCH 19/26] update tests after rebase --- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 4320 +++++++++++------------ llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 3240 ++++++++--------- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 4320 +++++++++++------------ llvm/test/CodeGen/NVPTX/cmpxchg.ll | 240 +- 4 files changed, 6060 
insertions(+), 6060 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 8d5800eccef9d..2371af07a151b 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB0_1; ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -59,17 +59,17 @@ define i8 
@monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: 
cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -149,17 +149,17 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 
255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -176,7 +176,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -194,17 +194,17 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -221,7 +221,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -239,17 +239,17 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: 
shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop @@ -266,7 +266,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -284,17 +284,17 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop @@ -311,7 +311,7 @@ define i8 
@monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -329,17 +329,17 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop @@ -356,7 +356,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -374,17 +374,17 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -401,7 +401,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic ret i8 %new @@ -419,17 +419,17 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: 
ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -446,7 +446,7 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -464,17 +464,17 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; 
SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop @@ -491,7 +491,7 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -509,17 +509,17 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop @@ -536,7 +536,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -554,17 +554,17 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 
3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop @@ -582,7 +582,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -600,17 +600,17 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: 
ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -628,7 +628,7 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -646,17 +646,17 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop @@ -674,7 +674,7 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; 
SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -692,17 +692,17 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop @@ -720,7 +720,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -738,17 +738,17 @@ define i8 
@monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop @@ -766,7 +766,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -784,17 +784,17 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; 
SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -812,7 +812,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -830,17 +830,17 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: 
shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -858,7 +858,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -876,17 +876,17 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop @@ -904,7 +904,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -922,17 +922,17 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 
%r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop @@ -950,7 +950,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire ret i8 %new @@ -968,17 +968,17 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop @@ -996,7 +996,7 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, 
i8 % ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -1014,17 +1014,17 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1042,7 +1042,7 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr 
addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -1060,17 +1060,17 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop @@ -1088,7 +1088,7 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -1106,18 +1106,18 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; ; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop @@ -1135,7 +1135,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -1153,18 +1153,18 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; 
SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop @@ -1182,7 +1182,7 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1200,18 +1200,18 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1229,7 +1229,7 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -1247,18 +1247,18 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1276,7 +1276,7 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1294,18 +1294,18 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: 
and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop @@ -1323,7 +1323,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -1341,18 +1341,18 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop @@ -1370,7 +1370,7 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // 
%partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1388,18 +1388,18 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop @@ -1417,7 +1417,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB30_1; ; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("block") monotonic seq_cst ret i8 %new @@ -1435,18 +1435,18 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1464,7 +1464,7 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB31_1; ; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1482,18 +1482,18 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; ; 
SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop @@ -1511,7 +1511,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB32_1; ; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst ret i8 %new @@ -1529,18 +1529,18 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: 
and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop @@ -1558,7 +1558,7 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB33_1; ; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1576,18 +1576,18 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 
255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop @@ -1605,7 +1605,7 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB34_1; ; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -1623,18 +1623,18 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; 
SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1652,7 +1652,7 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB35_1; ; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1670,17 +1670,17 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, 
%r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1698,7 +1698,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB36_1; ; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -1716,17 +1716,17 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop @@ -1744,7 +1744,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB37_1; ; SM60-NEXT: $L__BB37_3: // 
%partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -1762,17 +1762,17 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop @@ -1790,7 +1790,7 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB38_1; ; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -1808,17 +1808,17 @@ define 
i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop @@ -1836,7 +1836,7 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB39_1; ; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -1854,17 +1854,17 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, 
%rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1882,7 +1882,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB40_1; ; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -1900,17 +1900,17 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 
%r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop @@ -1928,7 +1928,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB41_1; ; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -1946,17 +1946,17 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 
%r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop @@ -1974,7 +1974,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB42_1; ; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -1992,17 +1992,17 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 
%r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop @@ -2020,7 +2020,7 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB43_1; ; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -2038,17 +2038,17 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2066,7 +2066,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB44_1; ; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic ret i8 %new @@ -2084,17 +2084,17 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop @@ -2112,7 +2112,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB45_1; ; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) 
%addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -2130,17 +2130,17 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop @@ -2158,7 +2158,7 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB46_1; ; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -2176,17 +2176,17 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; ; 
SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop @@ -2204,7 +2204,7 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB47_1; ; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -2222,17 +2222,17 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, 
%r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop @@ -2250,7 +2250,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB48_1; ; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -2268,17 +2268,17 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, 
[acquire_acquire_i8_generic_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop @@ -2296,7 +2296,7 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -2314,17 +2314,17 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop @@ -2342,7 +2342,7 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -2360,17 +2360,17 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop @@ -2388,7 +2388,7 @@ define i8 
@acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -2406,17 +2406,17 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop @@ -2434,7 +2434,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold 
= cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -2452,17 +2452,17 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop @@ -2480,7 +2480,7 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -2498,17 +2498,17 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM60-NEXT: 
ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop @@ -2526,7 +2526,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -2544,17 +2544,17 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, 
%r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop @@ -2572,7 +2572,7 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB55_1; ; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -2590,17 +2590,17 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop @@ -2618,7 +2618,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire ret i8 %new @@ -2636,17 +2636,17 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_sys_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop @@ -2664,7 +2664,7 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -2682,17 +2682,17 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // 
%partword.cmpxchg.loop @@ -2710,7 +2710,7 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -2728,17 +2728,17 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_gpu_param_1]; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop @@ -2756,7 +2756,7 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 
[func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -2774,18 +2774,18 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop @@ -2803,7 +2803,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -2821,18 +2821,18 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: 
ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop @@ -2850,7 +2850,7 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -2868,18 +2868,18 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop @@ -2897,7 +2897,7 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -2915,18 +2915,18 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 
%r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop @@ -2944,7 +2944,7 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -2962,18 +2962,18 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; 
SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop @@ -2991,7 +2991,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -3009,18 +3009,18 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop @@ -3038,7 +3038,7 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -3056,18 +3056,18 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop @@ -3085,7 +3085,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: 
@%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -3103,18 +3103,18 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop @@ -3132,7 +3132,7 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -3150,18 +3150,18 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop @@ -3179,7 +3179,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst ret i8 %new @@ -3197,18 +3197,18 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; ; SM60-NEXT: 
ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop @@ -3226,7 +3226,7 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB69_1; ; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -3244,18 +3244,18 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: 
and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop @@ -3273,7 +3273,7 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB70_1; ; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -3291,18 +3291,18 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; 
SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop @@ -3320,7 +3320,7 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB71_1; ; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -3338,18 +3338,18 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; 
+; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop @@ -3366,7 +3366,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -3384,18 +3384,18 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 
%r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop @@ -3412,7 +3412,7 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -3430,18 +3430,18 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop @@ -3458,7 +3458,7 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra 
$L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -3476,18 +3476,18 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop @@ -3504,7 +3504,7 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release 
monotonic ret i8 %new @@ -3522,18 +3522,18 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop @@ -3550,7 +3550,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -3568,18 +3568,18 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop @@ -3596,7 +3596,7 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -3614,18 +3614,18 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: 
and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop @@ -3642,7 +3642,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -3660,18 +3660,18 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 
%r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop @@ -3688,7 +3688,7 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -3706,18 +3706,18 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 
3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop @@ -3734,7 +3734,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic ret i8 %new @@ -3752,18 +3752,18 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 
%r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop @@ -3780,7 +3780,7 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -3798,18 +3798,18 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop @@ -3826,7 +3826,7 @@ define i8 
@release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -3844,18 +3844,18 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop @@ -3872,7 +3872,7 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 
[func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -3890,18 +3890,18 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop @@ -3919,7 +3919,7 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB84_1; ; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -3937,18 +3937,18 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: 
ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop @@ -3966,7 +3966,7 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -3984,18 +3984,18 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop @@ -4013,7 +4013,7 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -4031,18 +4031,18 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 
%r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop @@ -4060,7 +4060,7 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -4078,18 +4078,18 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; 
SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop @@ -4107,7 +4107,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB88_1; ; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -4125,18 +4125,18 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop @@ -4154,7 +4154,7 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB89_1; ; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -4172,18 +4172,18 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop @@ -4201,7 +4201,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: 
@%p2 bra $L__BB90_1; ; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -4219,18 +4219,18 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop @@ -4248,7 +4248,7 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB91_1; ; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -4266,18 +4266,18 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop @@ -4295,7 +4295,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB92_1; ; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire ret i8 %new @@ -4313,18 +4313,18 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; ; SM60-NEXT: 
ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop @@ -4342,7 +4342,7 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB93_1; ; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -4360,18 +4360,18 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: 
and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop @@ -4389,7 +4389,7 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB94_1; ; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -4407,18 +4407,18 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; 
SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop @@ -4436,7 +4436,7 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB95_1; ; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -4454,18 +4454,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; 
SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop @@ -4483,7 +4483,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB96_1; ; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -4501,18 +4501,18 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; 
SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop @@ -4530,7 +4530,7 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB97_1; ; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -4548,18 +4548,18 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop @@ -4577,7 +4577,7 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB98_1; ; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end ; SM60-NEXT: 
membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -4595,18 +4595,18 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop @@ -4624,7 +4624,7 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB99_1; ; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -4642,18 +4642,18 @@ define i8 
@release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop @@ -4671,7 +4671,7 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB100_1; ; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -4689,18 +4689,18 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, 
[release_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop @@ -4718,7 +4718,7 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB101_1; ; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -4736,18 +4736,18 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 
%r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop @@ -4765,7 +4765,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB102_1; ; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -4783,18 +4783,18 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; -; 
SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB103_1: // %partword.cmpxchg.loop @@ -4812,7 +4812,7 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB103_1; ; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -4830,18 +4830,18 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop @@ -4859,7 +4859,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB104_1; ; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ret i8 %new @@ -4877,18 +4877,18 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB105_1: // 
%partword.cmpxchg.loop @@ -4906,7 +4906,7 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB105_1; ; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -4924,18 +4924,18 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop @@ -4953,7 +4953,7 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB106_1; ; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; 
SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -4971,18 +4971,18 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop @@ -5000,7 +5000,7 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB107_1; ; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -5018,18 +5018,18 
@@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop @@ -5047,7 +5047,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB108_1; ; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -5065,18 +5065,18 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, 
[acq_rel_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop @@ -5094,7 +5094,7 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB109_1; ; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -5112,18 +5112,18 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; 
-; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop @@ -5141,7 +5141,7 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB110_1; ; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -5159,18 +5159,18 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, 
%r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop @@ -5188,7 +5188,7 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB111_1; ; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -5206,18 +5206,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop @@ -5235,7 +5235,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB112_1; ; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -5253,18 +5253,18 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB113_1: // %partword.cmpxchg.loop @@ 
-5282,7 +5282,7 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB113_1; ; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -5300,18 +5300,18 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop @@ -5329,7 +5329,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB114_1; ; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -5347,18 +5347,18 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop @@ -5376,7 +5376,7 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB115_1; ; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -5394,18 
+5394,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop @@ -5423,7 +5423,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB116_1; ; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ret i8 %new @@ -5441,18 +5441,18 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: 
membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop @@ -5470,7 +5470,7 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB117_1; ; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -5488,18 +5488,18 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; 
SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop @@ -5517,7 +5517,7 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB118_1; ; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -5535,18 +5535,18 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop @@ -5564,7 +5564,7 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB119_1; ; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -5582,18 +5582,18 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: 
shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop @@ -5611,7 +5611,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB120_1; ; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -5629,18 +5629,18 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: 
$L__BB121_1: // %partword.cmpxchg.loop @@ -5658,7 +5658,7 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB121_1; ; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -5676,18 +5676,18 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop @@ -5705,7 +5705,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB122_1; ; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 
[func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -5723,18 +5723,18 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop @@ -5752,7 +5752,7 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB123_1; ; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -5770,18 +5770,18 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 
%cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop @@ -5799,7 +5799,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB124_1; ; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -5817,18 +5817,18 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_sys_param_1]; ; SM60-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop @@ -5846,7 +5846,7 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB125_1; ; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -5864,18 +5864,18 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; 
SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop @@ -5893,7 +5893,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB126_1; ; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -5911,18 +5911,18 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop @@ -5940,7 +5940,7 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB127_1; ; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -5958,18 +5958,18 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: 
and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop @@ -5987,7 +5987,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB128_1; ; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -6005,18 +6005,18 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop @@ -6034,7 +6034,7 @@ 
define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB129_1; ; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -6052,18 +6052,18 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop @@ -6081,7 +6081,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB130_1; ; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; 
SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -6099,18 +6099,18 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop @@ -6128,7 +6128,7 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB131_1; ; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -6146,18 +6146,18 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr 
%addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop @@ -6175,7 +6175,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB132_1; ; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -6193,18 +6193,18 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; 
-; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop @@ -6222,7 +6222,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB133_1; ; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -6240,18 +6240,18 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 
%r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop @@ -6269,7 +6269,7 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB134_1; ; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -6287,18 +6287,18 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, 
%r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop @@ -6316,7 +6316,7 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB135_1; ; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -6334,18 +6334,18 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: 
ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop @@ -6363,7 +6363,7 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB136_1; ; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -6381,18 +6381,18 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop @@ -6410,7 +6410,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB137_1; ; 
SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -6428,18 +6428,18 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop @@ -6457,7 +6457,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB138_1; ; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 
%new syncscope("block") acq_rel seq_cst ret i8 %new @@ -6475,18 +6475,18 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop @@ -6504,7 +6504,7 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB139_1; ; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -6522,18 +6522,18 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; ; SM60-NEXT: 
ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop @@ -6551,7 +6551,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB140_1; ; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst ret i8 %new @@ -6569,18 +6569,18 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; 
SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop @@ -6598,7 +6598,7 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB141_1; ; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -6616,18 +6616,18 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop @@ -6645,7 +6645,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB142_1; ; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -6663,18 +6663,18 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; 
SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop @@ -6692,7 +6692,7 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB143_1; ; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -6710,18 +6710,18 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; 
; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop @@ -6739,7 +6739,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB144_1; ; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -6757,18 +6757,18 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop @@ -6786,7 +6786,7 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB145_1; ; SM60-NEXT: 
$L__BB145_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -6804,18 +6804,18 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop @@ -6833,7 +6833,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB146_1; ; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic 
ret i8 %new @@ -6851,18 +6851,18 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop @@ -6880,7 +6880,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB147_1; ; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -6898,18 +6898,18 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; 
; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop @@ -6927,7 +6927,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB148_1; ; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -6945,18 +6945,18 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; 
SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop @@ -6974,7 +6974,7 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB149_1; ; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -6992,18 +6992,18 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop @@ -7021,7 +7021,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB150_1; ; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -7039,18 +7039,18 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: 
mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop @@ -7068,7 +7068,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB151_1; ; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -7086,18 +7086,18 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop @@ -7115,7 +7115,7 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: @%p2 bra $L__BB152_1; ; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic ret i8 %new @@ -7133,18 +7133,18 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop @@ -7162,7 +7162,7 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: 
@%p2 bra $L__BB153_1; ; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -7180,18 +7180,18 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop @@ -7209,7 +7209,7 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB154_1; ; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg 
ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -7227,18 +7227,18 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop @@ -7256,7 +7256,7 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: @%p2 bra $L__BB155_1; ; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -7274,18 +7274,18 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, 
[seq_cst_acquire_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop @@ -7303,7 +7303,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB156_1; ; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -7321,18 +7321,18 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 
%r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop @@ -7350,7 +7350,7 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB157_1; ; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -7368,18 +7368,18 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop @@ -7397,7 +7397,7 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB158_1; ; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -7415,18 +7415,18 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: 
shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop @@ -7444,7 +7444,7 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB159_1; ; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -7462,18 +7462,18 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, 
%r2; ; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop @@ -7491,7 +7491,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB160_1; ; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -7509,18 +7509,18 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop @@ -7538,7 +7538,7 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB161_1; ; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end ; SM60-NEXT: 
membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -7556,18 +7556,18 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop @@ -7585,7 +7585,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB162_1; ; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ 
-7603,18 +7603,18 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop @@ -7632,7 +7632,7 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB163_1; ; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -7650,18 +7650,18 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; ; 
SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop @@ -7679,7 +7679,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB164_1; ; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire ret i8 %new @@ -7697,18 +7697,18 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 
%r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop @@ -7726,7 +7726,7 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB165_1; ; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -7744,18 +7744,18 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, 
[seq_cst_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop @@ -7773,7 +7773,7 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB166_1; ; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -7791,18 +7791,18 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: 
shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop @@ -7820,7 +7820,7 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB167_1; ; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -7838,18 +7838,18 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: 
and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop @@ -7867,7 +7867,7 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB168_1; ; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -7885,18 +7885,18 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop @@ -7914,7 +7914,7 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB169_1; ; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; 
SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -7932,18 +7932,18 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop @@ -7961,7 +7961,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB170_1; ; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -7979,18 +7979,18 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr 
%addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop @@ -8008,7 +8008,7 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB171_1; ; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new @@ -8026,18 +8026,18 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; ; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop @@ -8055,7 +8055,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB172_1; ; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -8073,18 +8073,18 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: 
cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop @@ -8102,7 +8102,7 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB173_1; ; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -8120,18 +8120,18 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, 
%rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop @@ -8149,7 +8149,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB174_1; ; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -8167,18 +8167,18 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: 
and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop @@ -8196,7 +8196,7 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB175_1; ; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new @@ -8214,18 +8214,18 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop @@ -8243,7 +8243,7 @@ 
define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: @%p2 bra $L__BB176_1; ; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst ret i8 %new @@ -8261,18 +8261,18 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM60-NEXT: membar.sys; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop @@ -8290,7 +8290,7 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB177_1; ; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -8308,18 +8308,18 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop @@ -8337,7 +8337,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB178_1; ; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -8355,18 +8355,18 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) 
%addr, i8 %cmp, i8 %ne ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM60-NEXT: membar.gl; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r9, %rd2; -; SM60-NEXT: and.b32 %r10, %r9, 3; -; SM60-NEXT: shl.b32 %r1, %r10, 3; -; SM60-NEXT: mov.b32 %r11, 255; -; SM60-NEXT: shl.b32 %r12, %r11, %r1; -; SM60-NEXT: not.b32 %r2, %r12; -; SM60-NEXT: cvt.u32.u16 %r13, %rs1; -; SM60-NEXT: and.b32 %r14, %r13, 255; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: shl.b32 %r4, %r15, %r1; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; ; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop @@ -8384,7 +8384,7 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: @%p2 bra $L__BB179_1; ; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r13; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index ddedc7ea36252..c29bd0be840ba 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 
%new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; 
SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: 
ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -149,17 +149,17 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: 
shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -176,7 +176,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -194,17 +194,17 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -221,7 +221,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -239,17 +239,17 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop @@ -266,7 +266,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) 
%addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -284,17 +284,17 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop @@ -311,7 +311,7 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: 
ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -329,17 +329,17 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop @@ -356,7 +356,7 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -374,17 +374,17 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -401,7 +401,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -419,17 +419,17 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 
%r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -447,7 +447,7 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -465,17 +465,17 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[monotonic_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop @@ -493,7 +493,7 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -511,17 +511,17 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: 
mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop @@ -539,7 +539,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -557,17 +557,17 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, 
[%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop @@ -585,7 +585,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -603,17 +603,17 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -631,7 +631,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -649,17 +649,17 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop @@ -677,7 +677,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") 
monotonic acquire ret i8 %new @@ -695,17 +695,17 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop @@ -723,7 +723,7 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -741,17 +741,17 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop @@ -769,7 +769,7 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -787,17 +787,17 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, 
%r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -815,7 +815,7 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -833,18 +833,18 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -862,7 +862,7 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -880,18 +880,18 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 
255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop @@ -909,7 +909,7 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -927,18 +927,18 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, 
[%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop @@ -956,7 +956,7 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -974,18 +974,18 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop @@ -1003,7 +1003,7 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB21_1; ; 
SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1021,18 +1021,18 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1050,7 +1050,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -1068,18 +1068,18 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop @@ -1097,7 +1097,7 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1115,18 +1115,18 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, 
[monotonic_seq_cst_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop @@ -1144,7 +1144,7 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB24_1; ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1162,18 +1162,18 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cta_param_1]; ; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop @@ -1191,7 +1191,7 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -1209,18 +1209,18 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; 
SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1238,7 +1238,7 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1256,17 +1256,17 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, 
[acquire_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1284,7 +1284,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -1302,17 +1302,17 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop @@ -1330,7 +1330,7 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -1348,17 +1348,17 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop @@ -1376,7 
+1376,7 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -1394,17 +1394,17 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop @@ -1422,7 +1422,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB30_1; ; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; 
SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -1440,17 +1440,17 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1468,7 +1468,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB31_1; ; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -1486,17 +1486,17 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr 
addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop @@ -1514,7 +1514,7 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB32_1; ; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -1532,17 +1532,17 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 
%r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop @@ -1560,7 +1560,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB33_1; ; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -1578,17 +1578,17 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, 
%r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop @@ -1606,7 +1606,7 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB34_1; ; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -1624,17 +1624,17 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM70-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1652,7 +1652,7 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB35_1; ; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -1670,17 +1670,17 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 
%r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1698,7 +1698,7 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB36_1; ; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -1716,17 +1716,17 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop @@ -1744,7 +1744,7 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 
%cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB37_1; ; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -1762,17 +1762,17 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop @@ -1790,7 +1790,7 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB38_1; ; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 
%new syncscope("device") acquire acquire ret i8 %new @@ -1808,17 +1808,17 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop @@ -1836,7 +1836,7 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB39_1; ; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -1854,17 +1854,17 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1882,7 +1882,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB40_1; ; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -1900,17 +1900,17 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; 
-; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop @@ -1928,7 +1928,7 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB41_1; ; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -1946,17 +1946,17 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; 
+; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop @@ -1974,7 +1974,7 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB42_1; ; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -1992,17 +1992,17 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; 
SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop @@ -2020,7 +2020,7 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB43_1; ; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -2038,17 +2038,17 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_gpu_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; 
; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2066,7 +2066,7 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB44_1; ; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -2084,18 +2084,18 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop @@ -2113,7 +2113,7 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB45_1; ; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end ; SM70-NEXT: 
fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -2131,18 +2131,18 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop @@ -2160,7 +2160,7 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB46_1; ; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -2178,18 +2178,18 @@ define i8 
@acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop @@ -2207,7 +2207,7 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB47_1; ; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -2225,18 +2225,18 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: 
ld.param.b8 %r9, [acquire_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop @@ -2254,7 +2254,7 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB48_1; ; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -2272,18 +2272,18 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; 
-; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop @@ -2301,7 +2301,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB49_1; ; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -2319,18 +2319,18 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[acquire_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop @@ -2348,7 +2348,7 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB50_1; ; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -2366,18 +2366,18 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop @@ -2395,7 +2395,7 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB51_1; ; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -2413,18 +2413,18 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: 
ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop @@ -2442,7 +2442,7 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB52_1; ; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -2460,18 +2460,18 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop @@ -2489,7 +2489,7 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: 
@%p2 bra $L__BB53_1; ; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -2507,18 +2507,18 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop @@ -2535,7 +2535,7 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB54_1; ; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = 
cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -2553,18 +2553,18 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop @@ -2581,7 +2581,7 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB55_1; ; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -2599,18 +2599,18 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, 
[release_monotonic_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop @@ -2627,7 +2627,7 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB56_1; ; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -2645,18 +2645,18 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop @@ -2673,7 +2673,7 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB57_1; ; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -2691,18 +2691,18 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: 
not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop @@ -2719,7 +2719,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB58_1; ; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -2737,18 +2737,18 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; -; 
SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop @@ -2765,7 +2765,7 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB59_1; ; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -2783,18 +2783,18 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; 
+; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop @@ -2811,7 +2811,7 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -2829,18 +2829,18 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, 
[%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop @@ -2857,7 +2857,7 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB61_1; ; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -2875,18 +2875,18 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop @@ -2903,7 +2903,7 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: 
mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB62_1; ; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -2921,18 +2921,18 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop @@ -2950,7 +2950,7 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB63_1; ; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr 
%addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -2968,18 +2968,18 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop @@ -2997,7 +2997,7 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -3015,18 +3015,18 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; ; SM70-NEXT: 
ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop @@ -3044,7 +3044,7 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -3062,18 +3062,18 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: 
and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop @@ -3091,7 +3091,7 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB66_1; ; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -3109,18 +3109,18 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, 
%r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop @@ -3138,7 +3138,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB67_1; ; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -3156,18 +3156,18 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, 
%r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop @@ -3185,7 +3185,7 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB68_1; ; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -3203,18 +3203,18 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; 
+; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop @@ -3232,7 +3232,7 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB69_1; ; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -3250,18 +3250,18 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop @@ -3279,7 +3279,7 @@ define 
i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB70_1; ; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -3297,18 +3297,18 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop @@ -3326,7 +3326,7 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB71_1; ; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 
[func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -3344,18 +3344,18 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop @@ -3373,7 +3373,7 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB72_1; ; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -3391,18 +3391,18 @@ define i8 @release_seq_cst_i8_generic_cta(ptr 
%addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop @@ -3420,7 +3420,7 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB73_1; ; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -3438,18 +3438,18 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, 
[release_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop @@ -3467,7 +3467,7 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB74_1; ; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -3485,18 +3485,18 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; 
-; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop @@ -3514,7 +3514,7 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB75_1; ; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -3532,18 +3532,18 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: 
shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop @@ -3561,7 +3561,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB76_1; ; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -3579,18 +3579,18 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, 
%r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop @@ -3608,7 +3608,7 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB77_1; ; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -3626,18 +3626,18 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, 
%r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop @@ -3655,7 +3655,7 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB78_1; ; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -3673,18 +3673,18 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop @@ -3702,7 +3702,7 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB79_1; ; SM70-NEXT: $L__BB79_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -3720,18 +3720,18 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop @@ -3749,7 +3749,7 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB80_1; ; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new 
syncscope("device") release seq_cst ret i8 %new @@ -3767,18 +3767,18 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop @@ -3796,7 +3796,7 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB81_1; ; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -3814,18 +3814,18 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 
%rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop @@ -3843,7 +3843,7 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB82_1; ; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -3861,18 +3861,18 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: 
and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop @@ -3890,7 +3890,7 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB83_1; ; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -3908,18 +3908,18 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; 
SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop @@ -3937,7 +3937,7 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB84_1; ; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -3955,18 +3955,18 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; 
+; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop @@ -3984,7 +3984,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB85_1; ; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -4002,18 +4002,18 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; 
+; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop @@ -4031,7 +4031,7 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB86_1; ; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -4049,18 +4049,18 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop @@ -4078,7 
+4078,7 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB87_1; ; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -4096,18 +4096,18 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop @@ -4125,7 +4125,7 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB88_1; ; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -4143,18 +4143,18 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop @@ -4172,7 +4172,7 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB89_1; ; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ 
-4190,18 +4190,18 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop @@ -4219,7 +4219,7 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB90_1; ; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -4237,18 +4237,18 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: 
fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop @@ -4266,7 +4266,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB91_1; ; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -4284,18 +4284,18 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 
%r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop @@ -4313,7 +4313,7 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB92_1; ; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -4331,18 +4331,18 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[acq_rel_acquire_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop @@ -4360,7 +4360,7 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB93_1; ; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -4378,18 +4378,18 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop @@ -4407,7 +4407,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB94_1; ; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -4425,18 +4425,18 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop @@ -4454,7 +4454,7 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB95_1; ; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -4472,18 +4472,18 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop @@ -4501,7 +4501,7 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 
%ne ; SM70-NEXT: @%p2 bra $L__BB96_1; ; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -4519,18 +4519,18 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop @@ -4548,7 +4548,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB97_1; ; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: 
ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -4566,18 +4566,18 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop @@ -4595,7 +4595,7 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB98_1; ; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -4613,18 +4613,18 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: 
ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop @@ -4642,7 +4642,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB99_1; ; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -4660,18 +4660,18 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; 
-; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop @@ -4689,7 +4689,7 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB100_1; ; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -4707,18 +4707,18 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; 
SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop @@ -4736,7 +4736,7 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB101_1; ; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -4754,18 +4754,18 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 
3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB102_1: // %partword.cmpxchg.loop @@ -4783,7 +4783,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB102_1; ; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -4801,18 +4801,18 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: 
shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop @@ -4830,7 +4830,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB103_1; ; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -4848,18 +4848,18 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop @@ -4877,7 +4877,7 @@ define i8 
@acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB104_1; ; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -4895,18 +4895,18 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop @@ -4924,7 +4924,7 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB105_1; ; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 
[func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -4942,18 +4942,18 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop @@ -4971,7 +4971,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB106_1; ; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -4989,18 +4989,18 @@ define i8 
@acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop @@ -5018,7 +5018,7 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB107_1; ; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -5036,18 +5036,18 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; ; SM70-NEXT: 
fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop @@ -5065,7 +5065,7 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB108_1; ; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -5083,18 +5083,18 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: 
mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop @@ -5112,7 +5112,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB109_1; ; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -5130,18 +5130,18 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, 
[seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop @@ -5159,7 +5159,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB110_1; ; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -5177,18 +5177,18 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: 
shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop @@ -5206,7 +5206,7 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB111_1; ; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -5224,18 +5224,18 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: 
ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop @@ -5253,7 +5253,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB112_1; ; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -5271,18 +5271,18 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB113_1: // %partword.cmpxchg.loop @@ -5300,7 +5300,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % 
; SM70-NEXT: @%p2 bra $L__BB113_1; ; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -5318,18 +5318,18 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop @@ -5347,7 +5347,7 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB114_1; ; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -5365,18 +5365,18 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop @@ -5394,7 +5394,7 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB115_1; ; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -5412,18 +5412,18 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, 
i8 % ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop @@ -5441,7 +5441,7 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: @%p2 bra $L__BB116_1; ; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -5459,18 +5459,18 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, 
[seq_cst_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop @@ -5488,7 +5488,7 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB117_1; ; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -5506,18 +5506,18 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; 
SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop @@ -5535,7 +5535,7 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB118_1; ; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -5553,18 +5553,18 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop @@ -5582,7 +5582,7 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB119_1; ; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -5600,18 +5600,18 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop @@ -5629,7 +5629,7 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB120_1; ; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -5647,18 +5647,18 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB121_1: // 
%partword.cmpxchg.loop @@ -5676,7 +5676,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB121_1; ; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -5694,18 +5694,18 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop @@ -5723,7 +5723,7 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB122_1; ; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end ; SM70-NEXT: 
fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -5741,18 +5741,18 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop @@ -5770,7 +5770,7 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB123_1; ; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret 
i8 %new @@ -5788,18 +5788,18 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop @@ -5817,7 +5817,7 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB124_1; ; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -5835,18 +5835,18 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop @@ -5864,7 +5864,7 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB125_1; ; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -5882,18 +5882,18 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: 
and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop @@ -5911,7 +5911,7 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB126_1; ; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -5929,18 +5929,18 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop @@ -5958,7 +5958,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB127_1; ; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -5976,18 +5976,18 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 
255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop @@ -6005,7 +6005,7 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB128_1; ; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new @@ -6023,18 +6023,18 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 
%r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop @@ -6052,7 +6052,7 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB129_1; ; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -6070,18 +6070,18 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop @@ -6099,7 +6099,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra 
$L__BB130_1; ; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -6117,18 +6117,18 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop @@ -6146,7 +6146,7 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB131_1; ; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg 
ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new @@ -6164,18 +6164,18 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop @@ -6193,7 +6193,7 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB132_1; ; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -6211,18 +6211,18 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, 
[seq_cst_seq_cst_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; ; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop @@ -6240,7 +6240,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB133_1; ; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -6258,18 +6258,18 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop @@ -6287,7 +6287,7 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: @%p2 bra $L__BB134_1; ; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 68658255ad5af..a45c95fccf0cb 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; 
-; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB0_1; ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; 
+; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new @@ -149,17 +149,17 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -176,7 +176,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -194,17 +194,17 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -221,7 +221,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; 
SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -239,17 +239,17 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop @@ -266,7 +266,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr 
addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -284,17 +284,17 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop @@ -311,7 +311,7 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new @@ -329,17 +329,17 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop @@ -356,7 +356,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -374,17 +374,17 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: 
shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop @@ -401,7 +401,7 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic ret i8 %new @@ -419,17 +419,17 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop @@ -446,7 +446,7 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -464,17 +464,17 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop @@ -491,7 +491,7 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic ret i8 %new @@ -509,17 +509,17 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, 
%r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop @@ -536,7 +536,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic ret i8 %new @@ -554,17 +554,17 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop @@ -582,7 +582,7 @@ define i8 
@monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -600,17 +600,17 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop @@ -628,7 +628,7 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -646,17 +646,17 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop @@ -674,7 +674,7 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new @@ -692,17 +692,17 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop @@ -720,7 +720,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -738,17 +738,17 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; 
SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop @@ -766,7 +766,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -784,17 +784,17 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[monotonic_acquire_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop @@ -812,7 +812,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -830,17 +830,17 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; 
SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop @@ -858,7 +858,7 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new @@ -876,17 +876,17 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, 
%r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop @@ -904,7 +904,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -922,17 +922,17 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop @@ -950,7 +950,7 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) 
%addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire ret i8 %new @@ -968,17 +968,17 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop @@ -996,7 +996,7 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -1014,17 +1014,17 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop @@ -1042,7 +1042,7 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire ret i8 %new @@ -1060,17 +1060,17 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: 
ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop @@ -1088,7 +1088,7 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire ret i8 %new @@ -1106,18 +1106,18 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop @@ -1135,7 +1135,7 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1153,18 +1153,18 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop @@ -1182,7 +1182,7 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -1200,18 +1200,18 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop @@ -1229,7 +1229,7 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new @@ -1247,18 +1247,18 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 
%r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop @@ -1276,7 +1276,7 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1294,18 +1294,18 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // 
%partword.cmpxchg.loop @@ -1323,7 +1323,7 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1341,18 +1341,18 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop @@ -1370,7 +1370,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -1388,18 +1388,18 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop @@ -1417,7 +1417,7 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB30_1; ; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("cluster") monotonic seq_cst ret i8 %new @@ -1435,18 +1435,18 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop @@ -1464,7 +1464,7 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB31_1; ; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1482,18 +1482,18 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, 
[monotonic_seq_cst_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop @@ -1511,7 +1511,7 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB32_1; ; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst ret i8 %new @@ -1529,18 +1529,18 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cta_param_1]; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop @@ -1558,7 +1558,7 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB33_1; ; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -1576,18 +1576,18 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, 
%r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop @@ -1605,7 +1605,7 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB34_1; ; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst ret i8 %new @@ -1623,18 +1623,18 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[monotonic_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop @@ -1652,7 +1652,7 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB35_1; ; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst ret i8 %new @@ -1670,17 +1670,17 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 
%r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop @@ -1698,7 +1698,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB36_1; ; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -1716,17 +1716,17 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop @@ -1744,7 +1744,7 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB37_1; ; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -1762,17 +1762,17 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop @@ -1790,7 +1790,7 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB38_1; ; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new @@ -1808,17 +1808,17 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop @@ -1836,7 +1836,7 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB39_1; ; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -1854,17 +1854,17 @@ define i8 
@acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop @@ -1882,7 +1882,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB40_1; ; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -1900,17 +1900,17 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, 
-4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop @@ -1928,7 +1928,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB41_1; ; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -1946,17 +1946,17 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 
%r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop @@ -1974,7 +1974,7 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB42_1; ; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new @@ -1992,17 +1992,17 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, 
[acquire_monotonic_i8_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop @@ -2020,7 +2020,7 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB43_1; ; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic ret i8 %new @@ -2038,17 +2038,17 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 
%r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop @@ -2066,7 +2066,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB44_1; ; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic ret i8 %new @@ -2084,17 +2084,17 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: 
$L__BB45_1: // %partword.cmpxchg.loop @@ -2112,7 +2112,7 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB45_1; ; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -2130,17 +2130,17 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop @@ -2158,7 +2158,7 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB46_1; ; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end ; 
SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic ret i8 %new @@ -2176,17 +2176,17 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop @@ -2204,7 +2204,7 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB47_1; ; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic 
ret i8 %new @@ -2222,17 +2222,17 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop @@ -2250,7 +2250,7 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB48_1; ; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -2268,17 +2268,17 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop @@ -2296,7 +2296,7 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB49_1; ; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -2314,17 +2314,17 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, 
%r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop @@ -2342,7 +2342,7 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB50_1; ; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new @@ -2360,17 +2360,17 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop @@ -2388,7 +2388,7 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB51_1; ; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -2406,17 +2406,17 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, 
%r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop @@ -2434,7 +2434,7 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB52_1; ; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -2452,17 +2452,17 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop @@ -2480,7 +2480,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 
%cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB53_1; ; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -2498,17 +2498,17 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop @@ -2526,7 +2526,7 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB54_1; ; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: 
ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new @@ -2544,17 +2544,17 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop @@ -2572,7 +2572,7 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB55_1; ; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -2590,17 +2590,17 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, 
[acquire_acquire_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_sys_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop @@ -2618,7 +2618,7 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB56_1; ; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire ret i8 %new @@ -2636,17 +2636,17 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; 
SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop @@ -2664,7 +2664,7 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB57_1; ; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -2682,17 +2682,17 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[acquire_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cluster_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop @@ -2710,7 +2710,7 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB58_1; ; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire ret i8 %new @@ -2728,17 +2728,17 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_gpu_param_1]; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: 
shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop @@ -2756,7 +2756,7 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB59_1; ; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire ret i8 %new @@ -2774,18 +2774,18 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, 
%r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop @@ -2803,7 +2803,7 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -2821,18 +2821,18 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop @@ -2850,7 +2850,7 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; 
SM90-NEXT: @%p2 bra $L__BB61_1; ; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -2868,18 +2868,18 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop @@ -2897,7 +2897,7 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB62_1; ; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; 
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new @@ -2915,18 +2915,18 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop @@ -2944,7 +2944,7 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB63_1; ; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -2962,18 +2962,18 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, 
[acquire_seq_cst_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop @@ -2991,7 +2991,7 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB64_1; ; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -3009,18 +3009,18 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 
%rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop @@ -3038,7 +3038,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -3056,18 +3056,18 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: 
not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop @@ -3085,7 +3085,7 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new @@ -3103,18 +3103,18 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop @@ -3132,7 +3132,7 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB67_1; ; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -3150,18 +3150,18 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 
%r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop @@ -3179,7 +3179,7 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB68_1; ; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst ret i8 %new @@ -3197,18 +3197,18 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, 
%r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop @@ -3226,7 +3226,7 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB69_1; ; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -3244,18 +3244,18 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop @@ -3273,7 +3273,7 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB70_1; ; SM90-NEXT: 
$L__BB70_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst ret i8 %new @@ -3291,18 +3291,18 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop @@ -3320,7 +3320,7 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB71_1; ; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, 
i8 %cmp, i8 %new syncscope("device") acquire seq_cst ret i8 %new @@ -3338,18 +3338,18 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop @@ -3366,7 +3366,7 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB72_1; ; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -3384,18 +3384,18 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; ; SM90-NEXT: 
ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop @@ -3412,7 +3412,7 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB73_1; ; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -3430,18 +3430,18 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop @@ -3458,7 +3458,7 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB74_1; ; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new @@ -3476,18 +3476,18 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, 
%rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop @@ -3504,7 +3504,7 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB75_1; ; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -3522,18 +3522,18 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop @@ -3550,7 +3550,7 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB76_1; ; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic ret i8 %new @@ -3568,18 +3568,18 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; 
SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop @@ -3596,7 +3596,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB77_1; ; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -3614,18 +3614,18 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop @@ -3642,7 +3642,7 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB78_1; ; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new @@ -3660,18 +3660,18 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop @@ -3688,7 +3688,7 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB79_1; 
; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -3706,18 +3706,18 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop @@ -3734,7 +3734,7 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB80_1; ; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, 
i8 %new syncscope("") release monotonic ret i8 %new @@ -3752,18 +3752,18 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop @@ -3780,7 +3780,7 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB81_1; ; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -3798,18 +3798,18 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, 
[release_monotonic_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop @@ -3826,7 +3826,7 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB82_1; ; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic ret i8 %new @@ -3844,18 +3844,18 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, 
[release_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop @@ -3872,7 +3872,7 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB83_1; ; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic ret i8 %new @@ -3890,18 +3890,18 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; 
SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop @@ -3919,7 +3919,7 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB84_1; ; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -3937,18 +3937,18 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cta_param_1]; -; 
SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop @@ -3966,7 +3966,7 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB85_1; ; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -3984,18 +3984,18 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: 
not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop @@ -4013,7 +4013,7 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB86_1; ; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new @@ -4031,18 +4031,18 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop @@ -4060,7 +4060,7 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB87_1; ; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -4078,18 +4078,18 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop @@ -4107,7 +4107,7 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB88_1; ; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM90-NEXT: 
fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -4125,18 +4125,18 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop @@ -4154,7 +4154,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB89_1; ; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret 
i8 %new @@ -4172,18 +4172,18 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop @@ -4201,7 +4201,7 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB90_1; ; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new @@ -4219,18 +4219,18 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; ; SM90-NEXT: 
ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop @@ -4248,7 +4248,7 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB91_1; ; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -4266,18 +4266,18 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 
%r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop @@ -4295,7 +4295,7 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB92_1; ; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire ret i8 %new @@ -4313,18 +4313,18 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; 
SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop @@ -4342,7 +4342,7 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB93_1; ; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -4360,18 +4360,18 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop @@ -4389,7 +4389,7 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB94_1; ; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire ret i8 %new @@ -4407,18 +4407,18 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: 
cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop @@ -4436,7 +4436,7 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB95_1; ; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire ret i8 %new @@ -4454,18 +4454,18 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB96_1: 
// %partword.cmpxchg.loop @@ -4483,7 +4483,7 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB96_1; ; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -4501,18 +4501,18 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop @@ -4530,7 +4530,7 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB97_1; ; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 
[func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -4548,18 +4548,18 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop @@ -4577,7 +4577,7 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB98_1; ; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new @@ -4595,18 +4595,18 @@ define i8 
@release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop @@ -4624,7 +4624,7 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB99_1; ; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -4642,18 +4642,18 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: 
ld.param.b8 %r9, [release_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop @@ -4671,7 +4671,7 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB100_1; ; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -4689,18 +4689,18 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 
255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop @@ -4718,7 +4718,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB101_1; ; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -4736,18 +4736,18 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.b8 %r15, [release_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop @@ -4765,7 +4765,7 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB102_1; ; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new @@ -4783,18 +4783,18 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; 
SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop @@ -4812,7 +4812,7 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB103_1; ; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -4830,18 +4830,18 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop @@ -4859,7 +4859,7 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB104_1; ; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst ret i8 %new @@ -4877,18 +4877,18 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop @@ -4906,7 +4906,7 @@ define i8 @release_seq_cst_i8_shared_cta(ptr 
addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB105_1; ; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -4924,18 +4924,18 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop @@ -4953,7 +4953,7 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB106_1; ; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst ret i8 %new @@ -4971,18 +4971,18 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop @@ -5000,7 +5000,7 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB107_1; ; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst ret i8 %new @@ -5018,18 +5018,18 @@ define i8 
@acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop @@ -5047,7 +5047,7 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB108_1; ; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -5065,18 +5065,18 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; 
+; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop @@ -5094,7 +5094,7 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB109_1; ; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -5112,18 +5112,18 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; 
SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop @@ -5141,7 +5141,7 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB110_1; ; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new @@ -5159,18 +5159,18 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop @@ -5188,7 +5188,7 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB111_1; ; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -5206,18 +5206,18 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: 
mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop @@ -5235,7 +5235,7 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB112_1; ; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ -5253,18 +5253,18 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop @@ -5282,7 +5282,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB113_1; ; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -5300,18 +5300,18 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop @@ -5329,7 +5329,7 @@ define i8 
@acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB114_1; ; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new @@ -5347,18 +5347,18 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop @@ -5376,7 +5376,7 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB115_1; ; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: 
st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -5394,18 +5394,18 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop @@ -5423,7 +5423,7 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB116_1; ; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic ret i8 %new @@ 
-5441,18 +5441,18 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop @@ -5470,7 +5470,7 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB117_1; ; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -5488,18 +5488,18 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop @@ -5517,7 +5517,7 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB118_1; ; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic ret i8 %new @@ -5535,18 +5535,18 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop @@ -5564,7 +5564,7 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB119_1; ; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic ret i8 %new @@ -5582,18 +5582,18 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: 
cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop @@ -5611,7 +5611,7 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB120_1; ; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -5629,18 +5629,18 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; 
SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop @@ -5658,7 +5658,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB121_1; ; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -5676,18 +5676,18 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: 
and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop @@ -5705,7 +5705,7 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB122_1; ; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new @@ -5723,18 +5723,18 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop @@ -5752,7 +5752,7 @@ 
define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB123_1; ; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -5770,18 +5770,18 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop @@ -5799,7 +5799,7 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB124_1; ; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], 
%r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -5817,18 +5817,18 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop @@ -5846,7 +5846,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB125_1; ; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -5864,18 +5864,18 @@ define i8 
@acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop @@ -5893,7 +5893,7 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB126_1; ; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new @@ -5911,18 +5911,18 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop @@ -5940,7 +5940,7 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB127_1; ; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -5958,18 +5958,18 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; 
SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop @@ -5987,7 +5987,7 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB128_1; ; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire ret i8 %new @@ -6005,18 +6005,18 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: 
and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop @@ -6034,7 +6034,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB129_1; ; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -6052,18 +6052,18 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, 
%rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop @@ -6081,7 +6081,7 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB130_1; ; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire ret i8 %new @@ -6099,18 +6099,18 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, 
%rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop @@ -6128,7 +6128,7 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB131_1; ; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire ret i8 %new @@ -6146,18 +6146,18 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB132_1: // 
%partword.cmpxchg.loop @@ -6175,7 +6175,7 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB132_1; ; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -6193,18 +6193,18 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop @@ -6222,7 +6222,7 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB133_1; ; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 
[func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -6240,18 +6240,18 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop @@ -6269,7 +6269,7 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB134_1; ; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new @@ -6287,18 +6287,18 @@ define i8 
@acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop @@ -6316,7 +6316,7 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB135_1; ; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -6334,18 +6334,18 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: 
ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop @@ -6363,7 +6363,7 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB136_1; ; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -6381,18 +6381,18 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 
255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop @@ -6410,7 +6410,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB137_1; ; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -6428,18 +6428,18 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop @@ -6457,7 +6457,7 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB138_1; ; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new @@ -6475,18 +6475,18 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; 
SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop @@ -6504,7 +6504,7 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB139_1; ; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -6522,18 +6522,18 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop @@ -6551,7 +6551,7 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB140_1; ; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst ret i8 %new @@ -6569,18 +6569,18 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop @@ -6598,7 +6598,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr 
addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB141_1; ; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -6616,18 +6616,18 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop @@ -6645,7 +6645,7 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB142_1; ; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst ret i8 %new @@ -6663,18 +6663,18 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop @@ -6692,7 +6692,7 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB143_1; ; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst ret i8 %new @@ -6710,18 +6710,18 @@ define i8 
@seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop @@ -6739,7 +6739,7 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB144_1; ; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -6757,18 +6757,18 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; 
SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop @@ -6786,7 +6786,7 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB145_1; ; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -6804,18 +6804,18 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; 
SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop @@ -6833,7 +6833,7 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB146_1; ; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic ret i8 %new @@ -6851,18 +6851,18 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop @@ -6880,7 +6880,7 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB147_1; ; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -6898,18 +6898,18 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; 
+; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop @@ -6927,7 +6927,7 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB148_1; ; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -6945,18 +6945,18 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; 
SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop @@ -6974,7 +6974,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB149_1; ; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -6992,18 +6992,18 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop @@ -7021,7 +7021,7 @@ define i8 @seq_cst_monotonic_i8_global_cluster(ptr 
addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB150_1; ; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic ret i8 %new @@ -7039,18 +7039,18 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop @@ -7068,7 +7068,7 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB151_1; ; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -7086,18 +7086,18 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop @@ -7115,7 +7115,7 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB152_1; ; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic ret i8 %new @@ -7133,18 +7133,18 @@ define i8 
@seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop @@ -7162,7 +7162,7 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB153_1; ; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -7180,18 +7180,18 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop @@ -7209,7 +7209,7 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: @%p2 bra $L__BB154_1; ; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic ret i8 %new @@ -7227,18 +7227,18 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop @@ -7256,7 +7256,7 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: @%p2 bra $L__BB155_1; ; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic ret i8 %new @@ -7274,18 +7274,18 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 
%r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop @@ -7303,7 +7303,7 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB156_1; ; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -7321,18 +7321,18 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, 
%r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop @@ -7350,7 +7350,7 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB157_1; ; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -7368,18 +7368,18 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; 
SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop @@ -7397,7 +7397,7 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB158_1; ; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire ret i8 %new @@ -7415,18 +7415,18 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop @@ -7444,7 +7444,7 @@ define i8 
@seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB159_1; ; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -7462,18 +7462,18 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop @@ -7491,7 +7491,7 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB160_1; ; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -7509,18 +7509,18 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop @@ -7538,7 +7538,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB161_1; ; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -7556,18 +7556,18 @@ define i8 
@seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop @@ -7585,7 +7585,7 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB162_1; ; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire ret i8 %new @@ -7603,18 +7603,18 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop @@ -7632,7 +7632,7 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB163_1; ; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -7650,18 +7650,18 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: 
and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop @@ -7679,7 +7679,7 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB164_1; ; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire ret i8 %new @@ -7697,18 +7697,18 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 
255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop @@ -7726,7 +7726,7 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB165_1; ; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -7744,18 +7744,18 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: 
and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop @@ -7773,7 +7773,7 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB166_1; ; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire ret i8 %new @@ -7791,18 +7791,18 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 
%r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop @@ -7820,7 +7820,7 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB167_1; ; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire ret i8 %new @@ -7838,18 +7838,18 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop @@ -7867,7 
+7867,7 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB168_1; ; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -7885,18 +7885,18 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop @@ -7914,7 +7914,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB169_1; ; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -7932,18 +7932,18 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop @@ -7961,7 +7961,7 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB170_1; ; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst ret i8 %new @@ -7979,18 +7979,18 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 
%cmp, i8 %new) { ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop @@ -8008,7 +8008,7 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: @%p2 bra $L__BB171_1; ; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new @@ -8026,18 +8026,18 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, 
[seq_cst_seq_cst_i8_global_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop @@ -8055,7 +8055,7 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB172_1; ; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -8073,18 +8073,18 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: 
shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop @@ -8102,7 +8102,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB173_1; ; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -8120,18 +8120,18 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, 
[seq_cst_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop @@ -8149,7 +8149,7 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB174_1; ; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst ret i8 %new @@ -8167,18 +8167,18 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 
255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop @@ -8196,7 +8196,7 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB175_1; ; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new @@ -8214,18 +8214,18 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; ; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_sys_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; 
; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop @@ -8243,7 +8243,7 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB176_1; ; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst ret i8 %new @@ -8261,18 +8261,18 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; ; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop @@ -8290,7 +8290,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; 
SM90-NEXT: @%p2 bra $L__BB177_1; ; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -8308,18 +8308,18 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; ; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cluster_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop @@ -8337,7 +8337,7 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: @%p2 bra $L__BB178_1; ; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 
[func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst ret i8 %new @@ -8355,18 +8355,18 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; ; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_gpu_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r9, %rd2; -; SM90-NEXT: and.b32 %r10, %r9, 3; -; SM90-NEXT: shl.b32 %r1, %r10, 3; -; SM90-NEXT: mov.b32 %r11, 255; -; SM90-NEXT: shl.b32 %r12, %r11, %r1; -; SM90-NEXT: not.b32 %r2, %r12; -; SM90-NEXT: cvt.u32.u16 %r13, %rs1; -; SM90-NEXT: and.b32 %r14, %r13, 255; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: shl.b32 %r4, %r15, %r1; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; ; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop @@ -8384,7 +8384,7 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: @%p2 bra $L__BB179_1; ; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r13; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll 
b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index bff37d4cbba87..e087fcfe87917 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -21,17 +21,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -48,7 +48,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: mov.b32 %r20, %r8; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i8( @@ -62,17 +62,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 
3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop @@ -89,7 +89,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i8( ; SM90: { @@ -147,17 +147,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; 
+; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -175,7 +175,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: acquire_sys_i8( @@ -189,17 +189,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop @@ -217,7 
+217,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i8( ; SM90: { @@ -276,18 +276,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -304,7 +304,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: mov.b32 %r20, %r8; ; SM30-NEXT: @%p2 bra $L__BB2_1; ; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: release_sys_i8( @@ -318,18 +318,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: 
ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop @@ -346,7 +346,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i8( ; SM90: { @@ -405,18 +405,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: 
not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -434,7 +434,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB3_1; ; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: acq_rel_sys_i8( @@ -448,18 +448,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: 
not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop @@ -477,7 +477,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i8( ; SM90: { @@ -537,18 +537,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM30-NEXT: membar.sys; +; SM30-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r9, %rd2; -; SM30-NEXT: and.b32 %r10, %r9, 3; -; SM30-NEXT: shl.b32 %r1, %r10, 3; -; SM30-NEXT: mov.b32 %r11, 255; -; SM30-NEXT: shl.b32 %r12, %r11, %r1; -; SM30-NEXT: not.b32 %r2, %r12; -; SM30-NEXT: cvt.u32.u16 %r13, %rs1; -; SM30-NEXT: and.b32 %r14, %r13, 255; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; -; SM30-NEXT: shl.b32 %r4, %r15, %r1; +; SM30-NEXT: cvt.u32.u64 %r10, %rd2; +; SM30-NEXT: and.b32 %r11, %r10, 3; +; SM30-NEXT: shl.b32 %r1, %r11, 3; +; SM30-NEXT: mov.b32 %r12, 255; +; SM30-NEXT: shl.b32 %r13, %r12, %r1; +; SM30-NEXT: not.b32 %r2, %r13; +; SM30-NEXT: cvt.u32.u16 %r14, %rs1; +; SM30-NEXT: and.b32 %r15, %r14, 255; +; SM30-NEXT: shl.b32 %r3, %r15, %r1; +; SM30-NEXT: shl.b32 %r4, %r9, %r1; ; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -566,7 +566,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: @%p2 bra $L__BB4_1; ; SM30-NEXT: 
$L__BB4_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r13; +; SM30-NEXT: st.param.b32 [func_retval0], %r14; ; SM30-NEXT: ret; ; ; SM70-LABEL: seq_cst_sys_i8( @@ -580,18 +580,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r9, %rd2; -; SM70-NEXT: and.b32 %r10, %r9, 3; -; SM70-NEXT: shl.b32 %r1, %r10, 3; -; SM70-NEXT: mov.b32 %r11, 255; -; SM70-NEXT: shl.b32 %r12, %r11, %r1; -; SM70-NEXT: not.b32 %r2, %r12; -; SM70-NEXT: cvt.u32.u16 %r13, %rs1; -; SM70-NEXT: and.b32 %r14, %r13, 255; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; -; SM70-NEXT: shl.b32 %r4, %r15, %r1; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; ; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop @@ -609,7 +609,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r13; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i8( ; SM90: { From acd31beb72152225d11a94a795e18f71febc81b8 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 8 Jul 2025 22:40:46 +0000 Subject: [PATCH 20/26] address review comments --- 
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 23 +++++------ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 8 ++-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 44 ++++++++++----------- 4 files changed, 35 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 519fced023e12..1f165ff119246 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -487,7 +487,7 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { return true; } -static std::optional convertAS(unsigned AS) { +static std::optional convertAS(unsigned AS) { switch (AS) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::AddressSpace::Local; @@ -508,17 +508,12 @@ static std::optional convertAS(unsigned AS) { } } -static unsigned int getCodeAddrSpace(const MemSDNode *N) { +NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) { return convertAS(N->getMemOperand()->getAddrSpace()) .value_or(NVPTX::AddressSpace::Generic); } -unsigned int NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) const { - return convertAS(N->getMemOperand()->getAddrSpace()) - .value_or(NVPTX::AddressSpace::Generic); -} - -unsigned int NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { +NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { // No "sem" orderings for SM/PTX versions which do not support memory ordering if (!Subtarget->hasMemoryOrdering()) return NVPTX::Ordering::NotAtomic; @@ -540,7 +535,7 @@ unsigned int NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { } } -unsigned int NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { +NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { // No "scope" modifier for SM/PTX versions which do not support scoped atomics if (!Subtarget->hasAtomScope()) return NVPTX::Scope::Thread; @@ -559,7 +554,7 @@ struct 
OperationOrderings { static OperationOrderings getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); - auto CodeAddrSpace = getCodeAddrSpace(N); + auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N); bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); @@ -1051,7 +1046,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { const MVT LoadedVT = LoadedEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const unsigned CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1123,7 +1118,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { const MVT MemVT = MemEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const unsigned CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1339,7 +1334,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(ST); + const unsigned CodeAddrSpace = getAddrSpace(ST); SDLoc DL(ST); SDValue Chain = ST->getChain(); @@ -1389,7 +1384,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { assert(StoreVT.isSimple() && "Store value is not simple"); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(ST); + const unsigned CodeAddrSpace = getAddrSpace(ST); if (CodeAddrSpace == NVPTX::AddressSpace::Const) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 2228f292b3e2a..18aa7a3acc9e3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -102,9 +102,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel 
{ inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } - unsigned int getAddrSpace(const MemSDNode *N) const; - unsigned int getMemOrder(const MemSDNode *N) const; - unsigned int getAtomicScope(const MemSDNode *N) const; + NVPTX::Ordering getMemOrder(const MemSDNode *N) const; + NVPTX::Scope getAtomicScope(const MemSDNode *N) const; bool SelectADDR(SDValue Addr, SDValue &Base, SDValue &Offset); SDValue getPTXCmpMode(const CondCodeSDNode &CondCode); @@ -119,6 +118,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { std::pair insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N); NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const; + +public: + static NVPTX::AddressSpace getAddrSpace(const MemSDNode *N); }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 8d6fc11a4f04d..9fda617aa9c92 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6275,7 +6275,7 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, SyncScope::ID SSID = cast(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent - ? AtomicOrdering::SequentiallyConsistent + ? 
Ord : AtomicOrdering::Release, SSID); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 7faa13a8571d6..5e48a720a79ab 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1881,29 +1881,27 @@ multiclass F_ATOMIC_2 { - defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str # "\t$dst, [$addr], $b, $c;"; +multiclass F_ATOMIC_3 { + defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def _rr : NVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ir : NVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.RC:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ri : NVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; - def _ii : NVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.Imm:$c, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), asm_str, []>; } -} -multiclass F_ATOMIC_3_PATTERN { defvar GetSem = SDNodeXForm(N)), SDLoc(N)); }]>; @@ -1917,16 +1915,16 @@ multiclass F_ATOMIC_3_PATTERN; def : Pat<(op:$this addr:$addr, t.Ty:$b, t.Ty:$c), - (!cast(InstructionName#_rr) 
ADDR:$addr, t.Ty:$b, t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + (!cast(NAME # _rr) ADDR:$addr, t.Ty:$b, t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c), - (!cast(InstructionName#_ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + (!cast(NAME # _ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; def : Pat<(op:$this addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)), - (!cast(InstructionName#_#ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + (!cast(NAME # _#ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)), - (!cast(InstructionName#_#ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + (!cast(NAME # _#ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; } multiclass F_ATOMIC_2_AS preds = []> { @@ -1984,9 +1982,7 @@ defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS("atomic_cmp_swap_i"#t.Size); defm INT_PTX_ATOM_CAS_#t.Size - : F_ATOMIC_3; - - defm INT_PTX_ATOM_CAS_PAT_#t.Size : F_ATOMIC_3_PATTERN; + : F_ATOMIC_3; } // Support for scoped atomic operations. 
Matches @@ -2027,10 +2023,10 @@ multiclass ATOM2S_impl { +multiclass F_ATOMIC_3_INTRINSIC_PATTERN { foreach scope = ["cta", "sys"] in { foreach space = ["gen"] in { - defvar intrinsic = !cast("int_nvvm_atomic_" # OpStr # "_" # space # "_" # IntTypeStr # "_" # scope); + defvar intrinsic = !cast("int_nvvm_atomic_" # OpStr # "_" # space # "_i_" # scope); def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)), (!cast(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; @@ -2084,9 +2080,9 @@ multiclass ATOM2_incdec_impl { // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN; - defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN; - defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN; } defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; From 4fb578468547b14bf44221e07a56d4176ffd00aa Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Thu, 10 Jul 2025 01:15:25 +0000 Subject: [PATCH 21/26] update tests, address review comments --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 12 +- llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 8 +- llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 8 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 1710 ++++++++--------- llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 1352 ++++++------- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 1710 ++++++++--------- llvm/test/CodeGen/NVPTX/cmpxchg.ll | 20 +- .../NVPTX/distributed-shared-cluster.ll | 10 +- 8 files changed, 2415 insertions(+), 2415 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 5e48a720a79ab..d6dd076340019 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1887,19 +1887,19 @@ multiclass F_ATOMIC_3; + asm_str>; def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, 
t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), - asm_str, []>; + asm_str>; def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), - asm_str, []>; + asm_str>; def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), - asm_str, []>; + asm_str>; } defvar GetSem = SDNodeXForm(NAME # _ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; def : Pat<(op:$this addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)), - (!cast(NAME # _#ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + (!cast(NAME # _ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)), - (!cast(NAME # _#ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + (!cast(NAME # _ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; } multiclass F_ATOMIC_2_AS preds = []> { diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index 94f49b01e6ea6..f710d7f883a1b 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -70,7 +70,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX62-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r6, %r54; ; CHECKPTX62-NEXT: mov.b32 %r54, %r6; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; @@ -86,7 +86,7 @@ 
define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r9, %r55; ; CHECKPTX62-NEXT: mov.b32 %r55, %r9; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; @@ -107,7 +107,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX62-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r15, %r56; ; CHECKPTX62-NEXT: mov.b32 %r56, %r15; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; @@ -128,7 +128,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX62-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r21, %r57; ; CHECKPTX62-NEXT: mov.b32 %r57, %r21; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index b21bd16d55c2c..f96fd30019025 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32; 
+; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r6, %r54; ; CHECKPTX71-NEXT: mov.b32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; @@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r9, %r55; ; CHECKPTX71-NEXT: mov.b32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r15, %r56; ; CHECKPTX71-NEXT: mov.b32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; @@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r21, %r57; ; CHECKPTX71-NEXT: mov.b32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 2371af07a151b..655ee851c4083 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -31,7 +31,7 @@ define i8 
@monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -121,7 +121,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -166,7 +166,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -211,7 +211,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: 
or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -256,7 +256,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -301,7 +301,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -346,7 +346,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -391,7 +391,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: 
atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -436,7 +436,7 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -481,7 +481,7 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -526,7 +526,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -571,7 +571,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure @@ -617,7 +617,7 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -663,7 +663,7 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -709,7 +709,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB15_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -755,7 +755,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB16_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -801,7 +801,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB17_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -847,7 +847,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB18_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -893,7 +893,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB19_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -939,7 +939,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB20_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -985,7 +985,7 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, 
%r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1031,7 +1031,7 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB22_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1077,7 +1077,7 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB23_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1124,7 +1124,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB24_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1171,7 +1171,7 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; 
SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1218,7 +1218,7 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB26_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1265,7 +1265,7 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB27_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1312,7 +1312,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB28_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1359,7 +1359,7 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ 
-1406,7 +1406,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB30_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1453,7 +1453,7 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB31_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1500,7 +1500,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB32_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1547,7 +1547,7 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB33_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1594,7 +1594,7 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; 
SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB34_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1641,7 +1641,7 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB35_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1687,7 +1687,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB36_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1733,7 +1733,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB37_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1779,7 +1779,7 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: 
atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB38_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1825,7 +1825,7 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB39_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1871,7 +1871,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB40_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1917,7 +1917,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB41_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1963,7 +1963,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: 
setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB42_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2009,7 +2009,7 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB43_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2055,7 +2055,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB44_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2099,16 +2099,16 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB45_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 
bra $L__BB45_1; ; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2145,16 +2145,16 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB46_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB46_1; ; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -2191,16 +2191,16 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB47_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, 
%r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB47_1; ; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -2237,16 +2237,16 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB48_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB48_1; ; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2283,16 +2283,16 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB49_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; 
SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB49_1; ; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2329,16 +2329,16 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB50_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB50_1; ; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -2375,16 +2375,16 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB51_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; 
SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB51_1; ; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -2421,16 +2421,16 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB52_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB52_1; ; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2467,16 +2467,16 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB53_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM60-NEXT: and.b32 
%r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB53_1; ; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2513,16 +2513,16 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB54_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB54_1; ; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -2559,16 +2559,16 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB55_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: 
// in Loop: Header=BB55_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB55_1; ; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -2605,16 +2605,16 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB56_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB56_1; ; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2651,16 +2651,16 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB57_3; ; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB57_1; ; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2697,16 +2697,16 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB58_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB58_1; ; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -2743,16 +2743,16 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 
%p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB59_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB59_1; ; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -2790,16 +2790,16 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB60_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2837,16 +2837,16 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB61_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB61_1; ; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -2884,16 +2884,16 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB62_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB62_1; ; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -2931,16 +2931,16 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, 
[%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB63_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB63_1; ; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -2978,16 +2978,16 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB64_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -3025,16 +3025,16 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 
%r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB65_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -3072,16 +3072,16 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB66_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB66_1; ; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -3119,16 +3119,16 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; 
SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB67_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB67_1; ; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -3166,16 +3166,16 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB68_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB68_1; ; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -3213,16 +3213,16 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], 
%r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB69_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB69_1; ; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -3260,16 +3260,16 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB70_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB70_1; ; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -3307,16 +3307,16 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 
%r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB71_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB71_1; ; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -3354,16 +3354,16 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB72_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB72_1; ; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3400,16 +3400,16 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB73_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB73_1; ; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3446,16 +3446,16 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB74_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB74_1; ; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3492,16 +3492,16 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: 
$L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB75_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB75_1; ; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3538,16 +3538,16 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB76_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB76_1; ; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3584,16 +3584,16 @@ define i8 
@release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB77_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB77_1; ; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3630,16 +3630,16 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB78_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB78_1; ; SM60-NEXT: $L__BB78_3: // 
%partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3676,16 +3676,16 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB79_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB79_1; ; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3722,16 +3722,16 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB80_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, 
%r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB80_1; ; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3768,16 +3768,16 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB81_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB81_1; ; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3814,16 +3814,16 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB82_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; 
SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB82_1; ; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3860,16 +3860,16 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB83_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB83_1; ; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; @@ -3906,16 +3906,16 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB84_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM60-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB84_1; ; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -3953,16 +3953,16 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB85_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB85_1; ; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -4000,16 +4000,16 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB86_3; ; SM60-NEXT: // %bb.2: 
// %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB86_1; ; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -4047,16 +4047,16 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB87_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB87_1; ; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.gl; @@ -4094,16 +4094,16 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 
bra $L__BB88_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB88_1; ; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -4141,16 +4141,16 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB89_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB89_1; ; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -4191,12 +4191,12 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB90_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: 
setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB90_1; ; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end @@ -4238,12 +4238,12 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB91_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB91_1; ; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end @@ -4285,12 +4285,12 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB92_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB92_1; ; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end @@ -4332,12 +4332,12 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB93_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1 ; 
SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB93_1; ; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end @@ -4379,12 +4379,12 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB94_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB94_1; ; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end @@ -4426,12 +4426,12 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB95_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB95_1; ; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end @@ -4473,12 +4473,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB96_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB96_1; ; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end @@ -4520,12 +4520,12 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB97_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB97_1; ; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end @@ -4567,12 +4567,12 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB98_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB98_1; ; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end @@ -4614,12 +4614,12 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra 
$L__BB99_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB99_1; ; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end @@ -4661,12 +4661,12 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB100_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB100_1; ; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end @@ -4708,12 +4708,12 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB101_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB101_1; ; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end @@ -4755,12 +4755,12 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB102_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB102_1; ; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end @@ -4802,12 +4802,12 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB103_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB103_1; ; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end @@ -4849,12 +4849,12 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB104_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB104_1; ; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end @@ -4896,12 +4896,12 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: 
or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB105_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB105_1; ; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end @@ -4943,12 +4943,12 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB106_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB106_1; ; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end @@ -4990,12 +4990,12 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB107_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB107_1; ; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end @@ -5037,12 +5037,12 @@ define i8 
@acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB108_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB108_1; ; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end @@ -5084,12 +5084,12 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB109_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB109_1; ; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end @@ -5131,12 +5131,12 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB110_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB110_1; ; SM60-NEXT: $L__BB110_3: // 
%partword.cmpxchg.end @@ -5178,12 +5178,12 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB111_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB111_1; ; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end @@ -5225,12 +5225,12 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB112_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB112_1; ; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end @@ -5272,12 +5272,12 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB113_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: 
mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB113_1; ; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end @@ -5319,12 +5319,12 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB114_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB114_1; ; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end @@ -5366,12 +5366,12 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB115_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB115_1; ; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end @@ -5413,12 +5413,12 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB116_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; 
SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB116_1; ; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end @@ -5460,12 +5460,12 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB117_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB117_1; ; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end @@ -5507,12 +5507,12 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB118_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB118_1; ; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end @@ -5554,12 +5554,12 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB119_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB119_1; ; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end @@ -5601,12 +5601,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB120_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB120_1; ; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end @@ -5648,12 +5648,12 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB121_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB121_1; ; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end @@ -5695,12 +5695,12 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: 
@%p1 bra $L__BB122_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB122_1; ; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end @@ -5742,12 +5742,12 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB123_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB123_1; ; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end @@ -5789,12 +5789,12 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB124_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB124_1; ; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end @@ -5836,12 +5836,12 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB125_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB125_1; ; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end @@ -5883,12 +5883,12 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB126_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB126_1; ; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end @@ -5930,12 +5930,12 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB127_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB127_1; ; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end @@ -5977,12 +5977,12 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: 
or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB128_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB128_1; ; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end @@ -6024,12 +6024,12 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB129_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB129_1; ; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end @@ -6071,12 +6071,12 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB130_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB130_1; ; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end @@ -6118,12 +6118,12 @@ define i8 
@acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB131_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB131_1; ; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end @@ -6165,12 +6165,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB132_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB132_1; ; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end @@ -6212,12 +6212,12 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB133_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB133_1; ; SM60-NEXT: 
$L__BB133_3: // %partword.cmpxchg.end @@ -6259,12 +6259,12 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB134_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB134_1; ; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end @@ -6306,12 +6306,12 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB135_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB135_1; ; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end @@ -6353,12 +6353,12 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB136_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: 
mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB136_1; ; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end @@ -6400,12 +6400,12 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB137_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB137_1; ; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end @@ -6447,12 +6447,12 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB138_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB138_1; ; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end @@ -6494,12 +6494,12 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB139_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; 
SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB139_1; ; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end @@ -6541,12 +6541,12 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB140_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB140_1; ; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end @@ -6588,12 +6588,12 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB141_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB141_1; ; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end @@ -6635,12 +6635,12 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB142_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB142_1; ; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end @@ -6682,12 +6682,12 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB143_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB143_1; ; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end @@ -6729,12 +6729,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB144_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB144_1; ; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end @@ -6776,12 +6776,12 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, 
%r18; ; SM60-NEXT: @%p1 bra $L__BB145_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB145_1; ; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end @@ -6823,12 +6823,12 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB146_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB146_1; ; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end @@ -6870,12 +6870,12 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB147_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB147_1; ; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end @@ -6917,12 +6917,12 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB148_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB148_1; ; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end @@ -6964,12 +6964,12 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB149_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB149_1; ; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end @@ -7011,12 +7011,12 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB150_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB150_1; ; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end @@ -7058,12 +7058,12 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; 
SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB151_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB151_1; ; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end @@ -7105,12 +7105,12 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB152_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB152_1; ; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end @@ -7152,12 +7152,12 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB153_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB153_1; ; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end @@ -7199,12 +7199,12 @@ define i8 
@seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB154_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB154_1; ; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end @@ -7246,12 +7246,12 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB155_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB155_1; ; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end @@ -7293,12 +7293,12 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB156_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB156_1; ; 
SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end @@ -7340,12 +7340,12 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB157_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB157_1; ; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end @@ -7387,12 +7387,12 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB158_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB158_1; ; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end @@ -7434,12 +7434,12 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB159_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: 
mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB159_1; ; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end @@ -7481,12 +7481,12 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB160_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB160_1; ; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end @@ -7528,12 +7528,12 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB161_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB161_1; ; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end @@ -7575,12 +7575,12 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB162_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; 
SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB162_1; ; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end @@ -7622,12 +7622,12 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB163_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB163_1; ; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end @@ -7669,12 +7669,12 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB164_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB164_1; ; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end @@ -7716,12 +7716,12 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB165_3; ; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB165_1; ; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end @@ -7763,12 +7763,12 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB166_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB166_1; ; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end @@ -7810,12 +7810,12 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB167_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB167_1; ; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end @@ -7857,12 +7857,12 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 
%p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB168_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB168_1; ; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end @@ -7904,12 +7904,12 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB169_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB169_1; ; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end @@ -7951,12 +7951,12 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB170_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB170_1; ; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end @@ -7998,12 +7998,12 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: 
setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB171_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB171_1; ; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end @@ -8045,12 +8045,12 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB172_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB172_1; ; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end @@ -8092,12 +8092,12 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB173_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB173_1; ; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end @@ -8139,12 +8139,12 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: 
or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB174_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB174_1; ; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end @@ -8186,12 +8186,12 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB175_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB175_1; ; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end @@ -8233,12 +8233,12 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB176_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB176_1; ; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end @@ -8280,12 +8280,12 @@ define i8 
@seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB177_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB177_1; ; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end @@ -8327,12 +8327,12 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB178_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB178_1; ; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end @@ -8374,12 +8374,12 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB179_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra 
$L__BB179_1; ; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end @@ -8419,12 +8419,12 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB180_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB180_1; ; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end @@ -8463,12 +8463,12 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB181_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB181_1; ; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end @@ -8507,12 +8507,12 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB182_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 
%p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB182_1; ; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end @@ -8551,12 +8551,12 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB183_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB183_1; ; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end @@ -8595,12 +8595,12 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB184_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB184_1; ; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end @@ -8639,12 +8639,12 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB185_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM60-NEXT: 
and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB185_1; ; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end @@ -8683,12 +8683,12 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB186_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB186_1; ; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end @@ -8727,12 +8727,12 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB187_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB187_1; ; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end @@ -8771,12 +8771,12 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB188_3; ; SM60-NEXT: // 
%bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB188_1; ; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end @@ -8815,12 +8815,12 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB189_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB189_1; ; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end @@ -8859,12 +8859,12 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB190_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB190_1; ; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end @@ -8903,12 +8903,12 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, 
%r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB191_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB191_1; ; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end @@ -8947,12 +8947,12 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB192_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB192_1; ; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end @@ -8992,12 +8992,12 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB193_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB193_1; ; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end @@ -9037,12 +9037,12 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: 
atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB194_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB194_1; ; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end @@ -9082,12 +9082,12 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB195_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB195_1; ; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end @@ -9127,12 +9127,12 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB196_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB196_1; ; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end @@ -9172,12 +9172,12 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; 
SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB197_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB197_1; ; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end @@ -9217,12 +9217,12 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB198_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB198_1; ; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end @@ -9262,12 +9262,12 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB199_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB199_1; ; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end @@ -9307,12 
+9307,12 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB200_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB200_1; ; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end @@ -9352,12 +9352,12 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB201_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB201_1; ; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end @@ -9397,12 +9397,12 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB202_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; 
SM60-NEXT: @%p2 bra $L__BB202_1; ; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end @@ -9442,12 +9442,12 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB203_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB203_1; ; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end @@ -9488,12 +9488,12 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB204_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB204_1; ; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end @@ -9534,12 +9534,12 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB205_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB205_1; ; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end @@ -9580,12 +9580,12 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB206_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB206_1; ; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end @@ -9626,12 +9626,12 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB207_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB207_1; ; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end @@ -9672,12 +9672,12 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB208_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 ; 
SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB208_1; ; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end @@ -9718,12 +9718,12 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB209_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB209_1; ; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end @@ -9764,12 +9764,12 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB210_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB210_1; ; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end @@ -9810,12 +9810,12 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB211_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB211_1; ; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end @@ -9856,12 +9856,12 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB212_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB212_1; ; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end @@ -9902,12 +9902,12 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB213_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB213_1; ; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end @@ -9948,12 +9948,12 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 
%p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB214_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB214_1; ; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end @@ -9994,12 +9994,12 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB215_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB215_1; ; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end @@ -10039,12 +10039,12 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB216_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB216_1; ; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end @@ -10084,12 +10084,12 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; 
SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB217_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB217_1; ; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end @@ -10129,12 +10129,12 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB218_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB218_1; ; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end @@ -10174,12 +10174,12 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB219_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB219_1; ; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end @@ -10219,12 +10219,12 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % 
; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB220_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB220_1; ; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end @@ -10264,12 +10264,12 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB221_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB221_1; ; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end @@ -10309,12 +10309,12 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB222_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB222_1; ; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end @@ 
-10354,12 +10354,12 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB223_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB223_1; ; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end @@ -10399,12 +10399,12 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB224_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB224_1; ; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end @@ -10444,12 +10444,12 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB225_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 
%r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB225_1; ; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end @@ -10489,12 +10489,12 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB226_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB226_1; ; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end @@ -10534,12 +10534,12 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB227_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB227_1; ; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end @@ -10579,12 +10579,12 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB228_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB228_1; ; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end @@ -10624,12 +10624,12 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB229_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB229_1; ; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end @@ -10669,12 +10669,12 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB230_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB230_1; ; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end @@ -10714,12 +10714,12 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB231_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: 
Header=BB231_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB231_1; ; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end @@ -10759,12 +10759,12 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB232_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB232_1; ; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end @@ -10804,12 +10804,12 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB233_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB233_1; ; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end @@ -10849,12 +10849,12 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; 
SM60-NEXT: @%p1 bra $L__BB234_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB234_1; ; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end @@ -10894,12 +10894,12 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB235_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB235_1; ; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end @@ -10939,12 +10939,12 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB236_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB236_1; ; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end @@ -10984,12 +10984,12 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, 
[%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB237_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB237_1; ; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end @@ -11029,12 +11029,12 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB238_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB238_1; ; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end @@ -11074,12 +11074,12 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB239_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB239_1; ; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end @@ -11120,12 +11120,12 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 
%r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB240_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB240_1; ; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end @@ -11166,12 +11166,12 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB241_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB241_1; ; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end @@ -11212,12 +11212,12 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB242_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB242_1; ; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end @@ -11258,12 +11258,12 @@ define i16 
@acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB243_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB243_1; ; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end @@ -11304,12 +11304,12 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB244_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB244_1; ; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end @@ -11350,12 +11350,12 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB245_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra 
$L__BB245_1; ; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end @@ -11396,12 +11396,12 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB246_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB246_1; ; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end @@ -11442,12 +11442,12 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB247_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB247_1; ; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end @@ -11488,12 +11488,12 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB248_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB248_1; ; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end @@ -11534,12 +11534,12 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB249_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB249_1; ; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end @@ -11580,12 +11580,12 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB250_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB250_1; ; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end @@ -11626,12 +11626,12 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB251_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in 
Loop: Header=BB251_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB251_1; ; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end @@ -11672,12 +11672,12 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB252_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB252_1; ; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end @@ -11717,12 +11717,12 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB253_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB253_1; ; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end @@ -11762,12 +11762,12 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB254_3; ; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB254_1; ; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end @@ -11807,12 +11807,12 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB255_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB255_1; ; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end @@ -11852,12 +11852,12 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB256_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB256_1; ; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end @@ -11897,12 +11897,12 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 
%p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB257_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB257_1; ; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end @@ -11942,12 +11942,12 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB258_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB258_1; ; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end @@ -11987,12 +11987,12 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB259_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB259_1; ; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end @@ -12032,12 +12032,12 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 
%r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB260_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB260_1; ; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end @@ -12077,12 +12077,12 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB261_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB261_1; ; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end @@ -12122,12 +12122,12 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB262_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB262_1; ; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end @@ -12167,12 +12167,12 @@ define i16 
@release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB263_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB263_1; ; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end @@ -12212,12 +12212,12 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB264_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB264_1; ; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end @@ -12258,12 +12258,12 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB265_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB265_1; ; SM60-NEXT: 
$L__BB265_3: // %partword.cmpxchg.end @@ -12304,12 +12304,12 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB266_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB266_1; ; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end @@ -12350,12 +12350,12 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB267_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB267_1; ; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end @@ -12396,12 +12396,12 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB268_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; 
SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB268_1; ; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end @@ -12442,12 +12442,12 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB269_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB269_1; ; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end @@ -12488,12 +12488,12 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB270_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB270_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB270_1; ; SM60-NEXT: $L__BB270_3: // %partword.cmpxchg.end @@ -12534,12 +12534,12 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB271_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM60-NEXT: and.b32 
%r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB271_1; ; SM60-NEXT: $L__BB271_3: // %partword.cmpxchg.end @@ -12580,12 +12580,12 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB272_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB272_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB272_1; ; SM60-NEXT: $L__BB272_3: // %partword.cmpxchg.end @@ -12626,12 +12626,12 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB273_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB273_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB273_1; ; SM60-NEXT: $L__BB273_3: // %partword.cmpxchg.end @@ -12672,12 +12672,12 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB274_3; ; SM60-NEXT: // 
%bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB274_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB274_1; ; SM60-NEXT: $L__BB274_3: // %partword.cmpxchg.end @@ -12718,12 +12718,12 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB275_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB275_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB275_1; ; SM60-NEXT: $L__BB275_3: // %partword.cmpxchg.end @@ -12764,12 +12764,12 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB276_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB276_1; ; SM60-NEXT: $L__BB276_3: // %partword.cmpxchg.end @@ -12810,12 +12810,12 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: 
setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB277_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB277_1; ; SM60-NEXT: $L__BB277_3: // %partword.cmpxchg.end @@ -12856,12 +12856,12 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB278_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB278_1; ; SM60-NEXT: $L__BB278_3: // %partword.cmpxchg.end @@ -12902,12 +12902,12 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB279_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB279_1; ; SM60-NEXT: $L__BB279_3: // %partword.cmpxchg.end @@ -12948,12 +12948,12 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB280_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB280_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB280_1; ; SM60-NEXT: $L__BB280_3: // %partword.cmpxchg.end @@ -12994,12 +12994,12 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB281_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB281_1; ; SM60-NEXT: $L__BB281_3: // %partword.cmpxchg.end @@ -13040,12 +13040,12 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB282_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB282_1; ; SM60-NEXT: $L__BB282_3: // %partword.cmpxchg.end @@ -13086,12 +13086,12 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; 
SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB283_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB283_1; ; SM60-NEXT: $L__BB283_3: // %partword.cmpxchg.end @@ -13132,12 +13132,12 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB284_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB284_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB284_1; ; SM60-NEXT: $L__BB284_3: // %partword.cmpxchg.end @@ -13178,12 +13178,12 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB285_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB285_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB285_1; ; SM60-NEXT: $L__BB285_3: // %partword.cmpxchg.end @@ 
-13224,12 +13224,12 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB286_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB286_1; ; SM60-NEXT: $L__BB286_3: // %partword.cmpxchg.end @@ -13270,12 +13270,12 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB287_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB287_1; ; SM60-NEXT: $L__BB287_3: // %partword.cmpxchg.end @@ -13316,12 +13316,12 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB288_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB288_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; 
SM60-NEXT: @%p2 bra $L__BB288_1; ; SM60-NEXT: $L__BB288_3: // %partword.cmpxchg.end @@ -13362,12 +13362,12 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB289_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB289_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB289_1; ; SM60-NEXT: $L__BB289_3: // %partword.cmpxchg.end @@ -13408,12 +13408,12 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB290_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB290_1; ; SM60-NEXT: $L__BB290_3: // %partword.cmpxchg.end @@ -13454,12 +13454,12 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB291_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB291_1; ; SM60-NEXT: $L__BB291_3: // %partword.cmpxchg.end @@ -13500,12 +13500,12 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB292_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB292_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB292_1; ; SM60-NEXT: $L__BB292_3: // %partword.cmpxchg.end @@ -13546,12 +13546,12 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB293_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB293_1; ; SM60-NEXT: $L__BB293_3: // %partword.cmpxchg.end @@ -13592,12 +13592,12 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB294_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: 
Header=BB294_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB294_1; ; SM60-NEXT: $L__BB294_3: // %partword.cmpxchg.end @@ -13638,12 +13638,12 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB295_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB295_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB295_1; ; SM60-NEXT: $L__BB295_3: // %partword.cmpxchg.end @@ -13684,12 +13684,12 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB296_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB296_1; ; SM60-NEXT: $L__BB296_3: // %partword.cmpxchg.end @@ -13730,12 +13730,12 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; 
SM60-NEXT: @%p1 bra $L__BB297_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB297_1; ; SM60-NEXT: $L__BB297_3: // %partword.cmpxchg.end @@ -13776,12 +13776,12 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB298_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB298_1; ; SM60-NEXT: $L__BB298_3: // %partword.cmpxchg.end @@ -13822,12 +13822,12 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB299_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB299_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB299_1; ; SM60-NEXT: $L__BB299_3: // %partword.cmpxchg.end @@ -13868,12 +13868,12 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; 
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB300_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB300_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB300_1; ; SM60-NEXT: $L__BB300_3: // %partword.cmpxchg.end @@ -13914,12 +13914,12 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB301_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB301_1; ; SM60-NEXT: $L__BB301_3: // %partword.cmpxchg.end @@ -13960,12 +13960,12 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB302_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB302_1; ; SM60-NEXT: $L__BB302_3: // %partword.cmpxchg.end @@ -14006,12 +14006,12 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 
%r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB303_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB303_1; ; SM60-NEXT: $L__BB303_3: // %partword.cmpxchg.end @@ -14052,12 +14052,12 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB304_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB304_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB304_1; ; SM60-NEXT: $L__BB304_3: // %partword.cmpxchg.end @@ -14098,12 +14098,12 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB305_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB305_1; ; SM60-NEXT: $L__BB305_3: // %partword.cmpxchg.end @@ -14144,12 +14144,12 @@ define i16 
@acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB306_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB306_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB306_1; ; SM60-NEXT: $L__BB306_3: // %partword.cmpxchg.end @@ -14190,12 +14190,12 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB307_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB307_1; ; SM60-NEXT: $L__BB307_3: // %partword.cmpxchg.end @@ -14236,12 +14236,12 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB308_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB308_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra 
$L__BB308_1; ; SM60-NEXT: $L__BB308_3: // %partword.cmpxchg.end @@ -14282,12 +14282,12 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB309_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB309_1; ; SM60-NEXT: $L__BB309_3: // %partword.cmpxchg.end @@ -14328,12 +14328,12 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB310_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB310_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB310_1; ; SM60-NEXT: $L__BB310_3: // %partword.cmpxchg.end @@ -14374,12 +14374,12 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB311_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, 
%r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB311_1; ; SM60-NEXT: $L__BB311_3: // %partword.cmpxchg.end @@ -14420,12 +14420,12 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB312_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB312_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB312_1; ; SM60-NEXT: $L__BB312_3: // %partword.cmpxchg.end @@ -14466,12 +14466,12 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB313_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB313_1; ; SM60-NEXT: $L__BB313_3: // %partword.cmpxchg.end @@ -14512,12 +14512,12 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB314_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB314_1 Depth=1 ; 
SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB314_1; ; SM60-NEXT: $L__BB314_3: // %partword.cmpxchg.end @@ -14558,12 +14558,12 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB315_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB315_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB315_1; ; SM60-NEXT: $L__BB315_3: // %partword.cmpxchg.end @@ -14604,12 +14604,12 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB316_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB316_1; ; SM60-NEXT: $L__BB316_3: // %partword.cmpxchg.end @@ -14650,12 +14650,12 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB317_3; ; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB317_1; ; SM60-NEXT: $L__BB317_3: // %partword.cmpxchg.end @@ -14696,12 +14696,12 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB318_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB318_1; ; SM60-NEXT: $L__BB318_3: // %partword.cmpxchg.end @@ -14742,12 +14742,12 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB319_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB319_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB319_1; ; SM60-NEXT: $L__BB319_3: // %partword.cmpxchg.end @@ -14788,12 +14788,12 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, 
%r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB320_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB320_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB320_1; ; SM60-NEXT: $L__BB320_3: // %partword.cmpxchg.end @@ -14834,12 +14834,12 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB321_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB321_1; ; SM60-NEXT: $L__BB321_3: // %partword.cmpxchg.end @@ -14880,12 +14880,12 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB322_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB322_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB322_1; ; SM60-NEXT: $L__BB322_3: // %partword.cmpxchg.end @@ -14926,12 +14926,12 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, 
%r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB323_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB323_1; ; SM60-NEXT: $L__BB323_3: // %partword.cmpxchg.end @@ -14972,12 +14972,12 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB324_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB324_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB324_1; ; SM60-NEXT: $L__BB324_3: // %partword.cmpxchg.end @@ -15018,12 +15018,12 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB325_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB325_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB325_1; ; SM60-NEXT: $L__BB325_3: // %partword.cmpxchg.end @@ -15064,12 +15064,12 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 
%cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB326_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB326_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB326_1; ; SM60-NEXT: $L__BB326_3: // %partword.cmpxchg.end @@ -15110,12 +15110,12 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB327_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB327_1; ; SM60-NEXT: $L__BB327_3: // %partword.cmpxchg.end @@ -15156,12 +15156,12 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB328_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB328_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB328_1; ; SM60-NEXT: $L__BB328_3: // %partword.cmpxchg.end @@ 
-15202,12 +15202,12 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB329_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB329_1; ; SM60-NEXT: $L__BB329_3: // %partword.cmpxchg.end @@ -15248,12 +15248,12 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB330_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB330_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB330_1; ; SM60-NEXT: $L__BB330_3: // %partword.cmpxchg.end @@ -15294,12 +15294,12 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB331_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 
%r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB331_1; ; SM60-NEXT: $L__BB331_3: // %partword.cmpxchg.end @@ -15340,12 +15340,12 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB332_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB332_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB332_1; ; SM60-NEXT: $L__BB332_3: // %partword.cmpxchg.end @@ -15386,12 +15386,12 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB333_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB333_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB333_1; ; SM60-NEXT: $L__BB333_3: // %partword.cmpxchg.end @@ -15432,12 +15432,12 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB334_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; 
SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB334_1; ; SM60-NEXT: $L__BB334_3: // %partword.cmpxchg.end @@ -15478,12 +15478,12 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB335_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB335_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB335_1; ; SM60-NEXT: $L__BB335_3: // %partword.cmpxchg.end @@ -15524,12 +15524,12 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB336_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB336_1; ; SM60-NEXT: $L__BB336_3: // %partword.cmpxchg.end @@ -15570,12 +15570,12 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB337_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: 
// in Loop: Header=BB337_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB337_1; ; SM60-NEXT: $L__BB337_3: // %partword.cmpxchg.end @@ -15616,12 +15616,12 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB338_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB338_1; ; SM60-NEXT: $L__BB338_3: // %partword.cmpxchg.end @@ -15662,12 +15662,12 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB339_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB339_1; ; SM60-NEXT: $L__BB339_3: // %partword.cmpxchg.end @@ -15708,12 +15708,12 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra 
$L__BB340_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB340_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB340_1; ; SM60-NEXT: $L__BB340_3: // %partword.cmpxchg.end @@ -15754,12 +15754,12 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB341_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB341_1; ; SM60-NEXT: $L__BB341_3: // %partword.cmpxchg.end @@ -15800,12 +15800,12 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB342_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB342_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB342_1; ; SM60-NEXT: $L__BB342_3: // %partword.cmpxchg.end @@ -15846,12 +15846,12 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; 
SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB343_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB343_1; ; SM60-NEXT: $L__BB343_3: // %partword.cmpxchg.end @@ -15892,12 +15892,12 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB344_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB344_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB344_1; ; SM60-NEXT: $L__BB344_3: // %partword.cmpxchg.end @@ -15938,12 +15938,12 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB345_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB345_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB345_1; ; SM60-NEXT: $L__BB345_3: // %partword.cmpxchg.end @@ -15984,12 +15984,12 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; 
SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB346_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB346_1; ; SM60-NEXT: $L__BB346_3: // %partword.cmpxchg.end @@ -16030,12 +16030,12 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB347_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB347_1; ; SM60-NEXT: $L__BB347_3: // %partword.cmpxchg.end @@ -16076,12 +16076,12 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB348_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB348_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB348_1; ; SM60-NEXT: $L__BB348_3: // %partword.cmpxchg.end @@ -16122,12 +16122,12 @@ define i16 
@seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB349_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB349_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB349_1; ; SM60-NEXT: $L__BB349_3: // %partword.cmpxchg.end @@ -16168,12 +16168,12 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB350_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB350_1; ; SM60-NEXT: $L__BB350_3: // %partword.cmpxchg.end @@ -16214,12 +16214,12 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB351_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB351_1; ; SM60-NEXT: 
$L__BB351_3: // %partword.cmpxchg.end @@ -16260,12 +16260,12 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB352_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB352_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB352_1; ; SM60-NEXT: $L__BB352_3: // %partword.cmpxchg.end @@ -16306,12 +16306,12 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB353_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB353_1; ; SM60-NEXT: $L__BB353_3: // %partword.cmpxchg.end @@ -16352,12 +16352,12 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB354_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: 
setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB354_1; ; SM60-NEXT: $L__BB354_3: // %partword.cmpxchg.end @@ -16398,12 +16398,12 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB355_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB355_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB355_1; ; SM60-NEXT: $L__BB355_3: // %partword.cmpxchg.end @@ -16444,12 +16444,12 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB356_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB356_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB356_1; ; SM60-NEXT: $L__BB356_3: // %partword.cmpxchg.end @@ -16490,12 +16490,12 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB357_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB357_1 
Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB357_1; ; SM60-NEXT: $L__BB357_3: // %partword.cmpxchg.end @@ -16536,12 +16536,12 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB358_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB358_1; ; SM60-NEXT: $L__BB358_3: // %partword.cmpxchg.end @@ -16582,12 +16582,12 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: or.b32 %r16, %r19, %r3; ; SM60-NEXT: or.b32 %r17, %r19, %r4; ; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB359_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB359_1; ; SM60-NEXT: $L__BB359_3: // %partword.cmpxchg.end diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index c29bd0be840ba..07b2f2f8fa9b0 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -121,7 +121,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -166,7 +166,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -211,7 +211,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, 
%r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -256,7 +256,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -301,7 +301,7 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -346,7 +346,7 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -391,7 +391,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -436,7 +436,7 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -482,7 +482,7 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -528,7 +528,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -574,7 +574,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -620,7 +620,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -666,7 +666,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -712,7 +712,7 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -758,7 +758,7 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], 
%r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -804,7 +804,7 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -851,7 +851,7 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB18_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -898,7 +898,7 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB19_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -945,7 +945,7 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra 
$L__BB20_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -992,7 +992,7 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB21_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1039,7 +1039,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB22_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1086,7 +1086,7 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB23_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1133,7 +1133,7 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB24_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure @@ -1180,7 +1180,7 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB25_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1227,7 +1227,7 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB26_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1273,7 +1273,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB27_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1319,7 +1319,7 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB28_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1365,7 +1365,7 @@ define i8 
@acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB29_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1411,7 +1411,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB30_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1457,7 +1457,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB31_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1503,7 +1503,7 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB32_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1549,7 +1549,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) 
%addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB33_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1595,7 +1595,7 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB34_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1641,7 +1641,7 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB35_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1687,7 +1687,7 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB36_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1733,7 +1733,7 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB37_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1779,7 +1779,7 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB38_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1825,7 +1825,7 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB39_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1871,7 +1871,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB40_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1917,7 +1917,7 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: 
or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB41_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1963,7 +1963,7 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB42_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2009,7 +2009,7 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB43_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2055,7 +2055,7 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB44_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2100,16 +2100,16 @@ define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB45_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB45_1; ; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -2147,16 +2147,16 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB46_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB46_1; ; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -2194,16 +2194,16 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB47_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB47_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB47_1; ; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -2241,16 +2241,16 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB48_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB48_1; ; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -2288,16 +2288,16 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB49_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB49_1; ; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -2335,16 +2335,16 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB50_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB50_1; ; SM70-NEXT: $L__BB50_3: // 
%partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -2382,16 +2382,16 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB51_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB51_1; ; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -2429,16 +2429,16 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB52_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, 
%r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB52_1; ; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -2476,16 +2476,16 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB53_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB53_1; ; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -2523,16 +2523,16 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB54_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 
%p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB54_1; ; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2569,16 +2569,16 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB55_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB55_1; ; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2615,16 +2615,16 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB56_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB56_1; ; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2661,16 +2661,16 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB57_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB57_1; ; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2707,16 +2707,16 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB58_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB58_1; ; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2753,16 +2753,16 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB59_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB59_1; ; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2799,16 +2799,16 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; 
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB60_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2845,16 +2845,16 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB61_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB61_1; ; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2891,16 +2891,16 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB62_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB62_1; ; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; @@ -2937,16 +2937,16 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB63_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB63_1; ; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -2984,16 +2984,16 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; 
SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB64_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -3031,16 +3031,16 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB65_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -3078,16 +3078,16 @@ define i8 
@release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB66_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB66_1; ; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -3125,16 +3125,16 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB67_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB67_1; ; SM70-NEXT: 
$L__BB67_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -3172,16 +3172,16 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB68_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB68_1; ; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -3219,16 +3219,16 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB69_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: 
setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB69_1; ; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -3266,16 +3266,16 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB70_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB70_1; ; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -3313,16 +3313,16 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB71_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM70-NEXT: 
and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB71_1; ; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -3360,16 +3360,16 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB72_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB72_1; ; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -3407,16 +3407,16 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB73_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB73_1; ; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -3454,16 +3454,16 @@ define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB74_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB74_1; ; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -3501,16 +3501,16 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: 
setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB75_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB75_1; ; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -3547,17 +3547,17 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB76_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB76_1; ; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -3595,16 +3595,16 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB77_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB77_1; ; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -3642,16 +3642,16 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB78_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB78_1; ; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -3689,16 +3689,16 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; 
SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB79_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB79_1; ; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -3736,16 +3736,16 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB80_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB80_1; ; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -3783,16 +3783,16 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB81_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB81_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB81_1; ; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -3830,16 +3830,16 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB82_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB82_1; ; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -3877,16 +3877,16 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 
%new) { ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB83_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB83_1; ; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -3924,16 +3924,16 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB84_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB84_1; ; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ 
-3971,16 +3971,16 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB85_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB85_1; ; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4018,16 +4018,16 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB86_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: 
@%p2 bra $L__BB86_1; ; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -4065,16 +4065,16 @@ define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB87_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB87_1; ; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -4112,16 +4112,16 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB88_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 
%r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB88_1; ; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -4159,16 +4159,16 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB89_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB89_1; ; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.gpu; @@ -4209,12 +4209,12 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB90_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB90_1; ; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end @@ -4256,12 +4256,12 @@ define i8 
@acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB91_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB91_1; ; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end @@ -4303,12 +4303,12 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB92_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB92_1; ; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end @@ -4350,12 +4350,12 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB93_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB93_1; ; 
SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end @@ -4397,12 +4397,12 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB94_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB94_1; ; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end @@ -4444,12 +4444,12 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB95_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB95_1; ; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end @@ -4491,12 +4491,12 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB96_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; 
SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB96_1; ; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end @@ -4538,12 +4538,12 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB97_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB97_1; ; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end @@ -4585,12 +4585,12 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB98_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB98_1; ; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end @@ -4632,12 +4632,12 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB99_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: 
Header=BB99_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB99_1; ; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end @@ -4679,12 +4679,12 @@ define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB100_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB100_1; ; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end @@ -4726,12 +4726,12 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB101_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB101_1; ; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end @@ -4773,12 +4773,12 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra 
$L__BB102_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB102_1; ; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end @@ -4820,12 +4820,12 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB103_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB103_1; ; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end @@ -4867,12 +4867,12 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB104_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB104_1; ; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end @@ -4914,12 +4914,12 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], 
%r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB105_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB105_1; ; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end @@ -4961,12 +4961,12 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB106_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB106_1; ; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end @@ -5008,12 +5008,12 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB107_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB107_1; ; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end @@ -5055,12 +5055,12 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 
%r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB108_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB108_1; ; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end @@ -5102,12 +5102,12 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB109_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB109_1; ; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end @@ -5149,12 +5149,12 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB110_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB110_1; ; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end @@ -5196,12 +5196,12 @@ define i8 
@seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB111_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB111_1; ; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end @@ -5243,12 +5243,12 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB112_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB112_1; ; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end @@ -5290,12 +5290,12 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB113_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; 
SM70-NEXT: @%p2 bra $L__BB113_1; ; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end @@ -5337,12 +5337,12 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB114_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB114_1; ; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end @@ -5384,12 +5384,12 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB115_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB115_1; ; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end @@ -5431,12 +5431,12 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB116_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; 
-; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB116_1; ; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end @@ -5478,12 +5478,12 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB117_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB117_1; ; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end @@ -5525,12 +5525,12 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB118_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB118_1; ; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end @@ -5572,12 +5572,12 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB119_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM70-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB119_1; ; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end @@ -5619,12 +5619,12 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB120_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB120_1; ; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end @@ -5666,12 +5666,12 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB121_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB121_1; ; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end @@ -5713,12 +5713,12 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB122_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB122_1; ; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end @@ -5760,12 +5760,12 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB123_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB123_1; ; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end @@ -5807,12 +5807,12 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB124_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB124_1; ; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end @@ -5854,12 +5854,12 @@ define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, 
%r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB125_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB125_1; ; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end @@ -5901,12 +5901,12 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB126_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB126_1; ; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end @@ -5948,12 +5948,12 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB127_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB127_1; ; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end @@ -5995,12 +5995,12 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, 
i8 %new) { ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB128_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB128_1; ; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end @@ -6042,12 +6042,12 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB129_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB129_1; ; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end @@ -6089,12 +6089,12 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB130_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB130_1; ; SM70-NEXT: $L__BB130_3: // 
%partword.cmpxchg.end @@ -6136,12 +6136,12 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB131_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB131_1; ; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end @@ -6183,12 +6183,12 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB132_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB132_1; ; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end @@ -6230,12 +6230,12 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB133_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: 
setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB133_1; ; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end @@ -6277,12 +6277,12 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB134_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB134_1; ; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end @@ -6322,12 +6322,12 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB135_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB135_1; ; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end @@ -6366,12 +6366,12 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB136_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: 
Header=BB136_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB136_1; ; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end @@ -6410,12 +6410,12 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB137_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB137_1; ; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end @@ -6454,12 +6454,12 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB138_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB138_1; ; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end @@ -6498,12 +6498,12 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; 
; SM70-NEXT: @%p1 bra $L__BB139_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB139_1; ; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end @@ -6542,12 +6542,12 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB140_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB140_1; ; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end @@ -6586,12 +6586,12 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB141_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB141_1; ; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end @@ -6630,12 +6630,12 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB142_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB142_1; ; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end @@ -6674,12 +6674,12 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB143_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB143_1; ; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end @@ -6718,12 +6718,12 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB144_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB144_1; ; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end @@ -6763,12 +6763,12 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, 
i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB145_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB145_1; ; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end @@ -6808,12 +6808,12 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB146_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB146_1; ; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end @@ -6853,12 +6853,12 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB147_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB147_1; ; SM70-NEXT: $L__BB147_3: // 
%partword.cmpxchg.end @@ -6898,12 +6898,12 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB148_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB148_1; ; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end @@ -6943,12 +6943,12 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB149_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB149_1; ; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end @@ -6988,12 +6988,12 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB150_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: 
setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB150_1; ; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end @@ -7033,12 +7033,12 @@ define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB151_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB151_1; ; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end @@ -7078,12 +7078,12 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB152_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB152_1; ; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end @@ -7124,12 +7124,12 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB153_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: 
Header=BB153_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB153_1; ; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end @@ -7170,12 +7170,12 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB154_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB154_1; ; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end @@ -7216,12 +7216,12 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB155_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB155_1; ; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end @@ -7262,12 +7262,12 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; 
SM70-NEXT: @%p1 bra $L__BB156_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB156_1; ; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end @@ -7308,12 +7308,12 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB157_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB157_1; ; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end @@ -7354,12 +7354,12 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB158_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB158_1; ; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end @@ -7400,12 +7400,12 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: 
atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB159_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB159_1; ; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end @@ -7446,12 +7446,12 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB160_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB160_1; ; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end @@ -7492,12 +7492,12 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB161_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB161_1; ; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end @@ -7537,12 +7537,12 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr 
%addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB162_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB162_1; ; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end @@ -7582,12 +7582,12 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB163_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB163_1; ; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end @@ -7627,12 +7627,12 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB164_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB164_1; ; SM70-NEXT: $L__BB164_3: // 
%partword.cmpxchg.end @@ -7672,12 +7672,12 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB165_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB165_1; ; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end @@ -7717,12 +7717,12 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB166_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB166_1; ; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end @@ -7762,12 +7762,12 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB167_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: 
setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB167_1; ; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end @@ -7807,12 +7807,12 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB168_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB168_1; ; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end @@ -7852,12 +7852,12 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB169_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB169_1; ; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end @@ -7897,12 +7897,12 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB170_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in 
Loop: Header=BB170_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB170_1; ; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end @@ -7942,12 +7942,12 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB171_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB171_1; ; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end @@ -7987,12 +7987,12 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB172_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB172_1; ; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end @@ -8032,12 +8032,12 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 
bra $L__BB173_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB173_1; ; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end @@ -8077,12 +8077,12 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB174_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB174_1; ; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end @@ -8122,12 +8122,12 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB175_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB175_1; ; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end @@ -8167,12 +8167,12 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, 
[%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB176_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB176_1; ; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end @@ -8212,12 +8212,12 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB177_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB177_1; ; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end @@ -8257,12 +8257,12 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB178_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB178_1; ; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end @@ -8302,12 +8302,12 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; 
SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB179_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB179_1; ; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end @@ -8348,12 +8348,12 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB180_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB180_1; ; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end @@ -8394,12 +8394,12 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB181_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB181_1; ; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end @@ 
-8440,12 +8440,12 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB182_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB182_1; ; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end @@ -8486,12 +8486,12 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB183_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB183_1; ; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end @@ -8532,12 +8532,12 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB184_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: 
mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB184_1; ; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end @@ -8578,12 +8578,12 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB185_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB185_1; ; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end @@ -8624,12 +8624,12 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB186_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB186_1; ; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end @@ -8670,12 +8670,12 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB187_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM70-NEXT: 
and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB187_1; ; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end @@ -8716,12 +8716,12 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB188_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB188_1; ; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end @@ -8762,12 +8762,12 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB189_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB189_1; ; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end @@ -8807,12 +8807,12 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB190_3; ; SM70-NEXT: 
// %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB190_1; ; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end @@ -8852,12 +8852,12 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB191_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB191_1; ; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end @@ -8897,12 +8897,12 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB192_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB192_1; ; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end @@ -8942,12 +8942,12 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB193_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB193_1; ; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end @@ -8987,12 +8987,12 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB194_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB194_1; ; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end @@ -9032,12 +9032,12 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB195_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB195_1; ; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end @@ -9077,12 +9077,12 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; 
SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB196_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB196_1; ; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end @@ -9122,12 +9122,12 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB197_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB197_1; ; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end @@ -9167,12 +9167,12 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB198_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB198_1; ; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end @@ -9213,12 +9213,12 @@ define i16 
@release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB199_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB199_1; ; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end @@ -9259,12 +9259,12 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB200_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB200_1; ; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end @@ -9305,12 +9305,12 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB201_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra 
$L__BB201_1; ; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end @@ -9351,12 +9351,12 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB202_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB202_1; ; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end @@ -9397,12 +9397,12 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB203_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB203_1; ; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end @@ -9443,12 +9443,12 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB204_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB204_1; ; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end @@ -9489,12 +9489,12 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB205_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB205_1; ; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end @@ -9535,12 +9535,12 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB206_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB206_1; ; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end @@ -9581,12 +9581,12 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB207_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB207_1; ; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end @@ -9627,12 +9627,12 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB208_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB208_1; ; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end @@ -9673,12 +9673,12 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB209_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB209_1; ; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end @@ -9719,12 +9719,12 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB210_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB210_1; ; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end @@ -9765,12 +9765,12 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB211_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB211_1; ; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end @@ -9811,12 +9811,12 @@ define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB212_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB212_1; ; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end @@ -9857,12 +9857,12 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, 
%r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB213_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB213_1; ; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end @@ -9903,12 +9903,12 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB214_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB214_1; ; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end @@ -9949,12 +9949,12 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB215_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB215_1; ; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end @@ -9995,12 +9995,12 @@ define i16 
@acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB216_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB216_1; ; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end @@ -10041,12 +10041,12 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB217_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB217_1; ; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end @@ -10087,12 +10087,12 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB218_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra 
$L__BB218_1; ; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end @@ -10133,12 +10133,12 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB219_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB219_1; ; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end @@ -10179,12 +10179,12 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB220_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB220_1; ; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end @@ -10225,12 +10225,12 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB221_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB221_1; ; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end @@ -10271,12 +10271,12 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB222_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB222_1; ; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end @@ -10317,12 +10317,12 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB223_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB223_1; ; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end @@ -10363,12 +10363,12 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB224_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB224_1; ; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end @@ -10409,12 +10409,12 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB225_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB225_1; ; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end @@ -10455,12 +10455,12 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB226_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB226_1; ; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end @@ -10501,12 +10501,12 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB227_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB227_1; ; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end @@ -10547,12 +10547,12 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB228_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB228_1; ; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end @@ -10593,12 +10593,12 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB229_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB229_1; ; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end @@ -10639,12 +10639,12 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, 
%r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB230_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB230_1; ; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end @@ -10685,12 +10685,12 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB231_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB231_1; ; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end @@ -10731,12 +10731,12 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB232_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB232_1; ; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end @@ -10777,12 +10777,12 @@ define i16 
@acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB233_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB233_1; ; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end @@ -10823,12 +10823,12 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB234_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB234_1; ; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end @@ -10869,12 +10869,12 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB235_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 
bra $L__BB235_1; ; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end @@ -10915,12 +10915,12 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB236_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB236_1; ; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end @@ -10961,12 +10961,12 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB237_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB237_1; ; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end @@ -11007,12 +11007,12 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB238_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB238_1; ; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end @@ -11053,12 +11053,12 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB239_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB239_1; ; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end @@ -11099,12 +11099,12 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB240_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB240_1; ; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end @@ -11145,12 +11145,12 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB241_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB241_1; ; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end @@ -11191,12 +11191,12 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB242_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB242_1; ; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end @@ -11237,12 +11237,12 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB243_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB243_1; ; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end @@ -11283,12 +11283,12 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, 
%r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB244_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB244_1; ; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end @@ -11329,12 +11329,12 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB245_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB245_1; ; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end @@ -11375,12 +11375,12 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB246_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB246_1; ; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end @@ -11421,12 +11421,12 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, 
%r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB247_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB247_1; ; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end @@ -11467,12 +11467,12 @@ define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB248_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB248_1; ; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end @@ -11513,12 +11513,12 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB249_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB249_1; ; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end @@ -11559,12 +11559,12 @@ define i16 
@seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB250_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB250_1; ; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end @@ -11605,12 +11605,12 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB251_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB251_1; ; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end @@ -11651,12 +11651,12 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB252_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; 
SM70-NEXT: @%p2 bra $L__BB252_1; ; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end @@ -11697,12 +11697,12 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB253_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB253_1; ; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end @@ -11743,12 +11743,12 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB254_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB254_1; ; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end @@ -11789,12 +11789,12 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB255_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB255_1; ; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end @@ -11835,12 +11835,12 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB256_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB256_1; ; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end @@ -11881,12 +11881,12 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB257_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB257_1; ; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end @@ -11927,12 +11927,12 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB258_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB258_1; ; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end @@ -11973,12 +11973,12 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB259_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB259_1; ; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end @@ -12019,12 +12019,12 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB260_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB260_1; ; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end @@ -12065,12 +12065,12 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, 
%r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB261_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB261_1; ; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end @@ -12111,12 +12111,12 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB262_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB262_1; ; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end @@ -12157,12 +12157,12 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB263_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB263_1; ; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end @@ -12203,12 +12203,12 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, 
%r4; ; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB264_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB264_1; ; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end @@ -12249,12 +12249,12 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB265_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB265_1; ; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end @@ -12295,12 +12295,12 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB266_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB266_1; ; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end @@ -12341,12 +12341,12 @@ define i16 
@seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB267_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB267_1; ; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end @@ -12387,12 +12387,12 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB268_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB268_1; ; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end @@ -12433,12 +12433,12 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; ; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB269_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM70-NEXT: mov.b32 %r19, 
%r8; ; SM70-NEXT: @%p2 bra $L__BB269_1; ; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index a45c95fccf0cb..b05a404b6be7e 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -121,7 +121,7 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -166,7 +166,7 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -211,7 +211,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -256,7 +256,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -301,7 +301,7 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -346,7 +346,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -391,7 +391,7 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -436,7 +436,7 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -481,7 +481,7 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -526,7 +526,7 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 
%r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -571,7 +571,7 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -617,7 +617,7 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -663,7 +663,7 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -709,7 +709,7 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra 
$L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -755,7 +755,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -801,7 +801,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -847,7 +847,7 @@ define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -893,7 +893,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure @@ -939,7 +939,7 @@ define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -985,7 +985,7 @@ define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1031,7 +1031,7 @@ define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1077,7 +1077,7 @@ define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1124,7 
+1124,7 @@ define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1171,7 +1171,7 @@ define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB25_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1218,7 +1218,7 @@ define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1265,7 +1265,7 @@ define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1312,7 +1312,7 @@ define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 
% ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB28_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1359,7 +1359,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB29_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1406,7 +1406,7 @@ define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB30_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1453,7 +1453,7 @@ define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB31_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1500,7 +1500,7 @@ define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop 
Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB32_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1547,7 +1547,7 @@ define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB33_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1594,7 +1594,7 @@ define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB34_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1641,7 +1641,7 @@ define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB35_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1687,7 +1687,7 @@ define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, 
%r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB36_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1733,7 +1733,7 @@ define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB37_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1779,7 +1779,7 @@ define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB38_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1825,7 +1825,7 @@ define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB39_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1871,7 +1871,7 @@ define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB40_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1917,7 +1917,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB41_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1963,7 +1963,7 @@ define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB42_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2009,7 +2009,7 @@ define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB43_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2055,7 +2055,7 @@ define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, 
[%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB44_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -2099,16 +2099,16 @@ define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB45_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB45_1; ; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -2145,16 +2145,16 @@ define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB46_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: 
Header=BB46_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB46_1; ; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -2191,16 +2191,16 @@ define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB47_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB47_1; ; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -2237,16 +2237,16 @@ define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra 
$L__BB48_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB48_1; ; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -2283,16 +2283,16 @@ define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB49_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB49_1; ; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -2329,16 +2329,16 @@ define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, 
[%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB50_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB50_1; ; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -2375,16 +2375,16 @@ define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB51_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB51_1; ; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -2421,16 +2421,16 @@ define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; 
+; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB52_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB52_1; ; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -2467,16 +2467,16 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB53_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB53_1; ; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -2513,16 +2513,16 @@ define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, 
[%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB54_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB54_1; ; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -2559,16 +2559,16 @@ define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB55_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB55_1; ; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -2605,16 +2605,16 @@ define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB56_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB56_1; ; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -2651,16 +2651,16 @@ define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB57_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB57_1; ; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -2697,16 +2697,16 @@ define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; 
SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB58_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB58_1; ; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -2743,16 +2743,16 @@ define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB59_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB59_1; ; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -2790,16 +2790,16 @@ define 
i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB60_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -2837,16 +2837,16 @@ define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB61_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB61_1; ; SM90-NEXT: $L__BB61_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -2884,16 +2884,16 @@ define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB62_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB62_1; ; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -2931,16 +2931,16 @@ define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB63_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: 
mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB63_1; ; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -2978,16 +2978,16 @@ define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB64_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB64_1; ; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -3025,16 +3025,16 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB65_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -3072,16 +3072,16 @@ define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB66_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -3119,16 +3119,16 @@ define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB67_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB67_1; ; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -3166,16 +3166,16 @@ define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB68_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB68_1; ; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -3213,16 +3213,16 @@ define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB69_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB69_1; ; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -3260,16 +3260,16 @@ define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB70_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB70_1; ; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -3307,16 +3307,16 @@ define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, 
%r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB71_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB71_1; ; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -3354,16 +3354,16 @@ define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB72_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB72_1; ; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3400,16 +3400,16 @@ define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB73_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB73_1; ; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3446,16 +3446,16 @@ define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB74_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB74_1; ; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3492,16 +3492,16 @@ define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB75_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB75_1; ; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3538,16 +3538,16 @@ define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB76_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB76_1; ; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3584,16 +3584,16 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; 
SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB77_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB77_1; ; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3630,16 +3630,16 @@ define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB78_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB78_1; ; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end ; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; @@ -3676,16 +3676,16 @@ define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB79_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB79_1; ; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3722,16 +3722,16 @@ define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB80_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, 
%r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB80_1; ; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3768,16 +3768,16 @@ define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB81_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB81_1; ; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3814,16 +3814,16 @@ define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB82_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1 ; 
SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB82_1; ; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3860,16 +3860,16 @@ define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB83_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB83_1; ; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; @@ -3906,16 +3906,16 @@ define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 
bra $L__BB84_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB84_1; ; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -3953,16 +3953,16 @@ define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB85_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB85_1; ; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -4000,16 +4000,16 @@ define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cluster.cas.b32 
%r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB86_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB86_1; ; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cluster; @@ -4047,16 +4047,16 @@ define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB87_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB87_1; ; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.gpu; @@ -4094,16 +4094,16 @@ define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, 
%r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB88_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB88_1; ; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -4141,16 +4141,16 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB89_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB89_1; ; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -4191,12 +4191,12 @@ define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 
bra $L__BB90_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB90_1; ; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end @@ -4238,12 +4238,12 @@ define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB91_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB91_1; ; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end @@ -4285,12 +4285,12 @@ define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB92_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB92_1; ; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end @@ -4332,12 +4332,12 @@ define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, 
%r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB93_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB93_1; ; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end @@ -4379,12 +4379,12 @@ define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB94_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB94_1; ; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end @@ -4426,12 +4426,12 @@ define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB95_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB95_1; ; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end @@ -4473,12 +4473,12 @@ define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, 
%r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB96_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB96_1; ; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end @@ -4520,12 +4520,12 @@ define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB97_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB97_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB97_1; ; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end @@ -4567,12 +4567,12 @@ define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB98_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB98_1; ; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end @@ -4614,12 +4614,12 @@ define i8 
@release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB99_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB99_1; ; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end @@ -4661,12 +4661,12 @@ define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB100_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB100_1; ; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end @@ -4708,12 +4708,12 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB101_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB101_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 
bra $L__BB101_1; ; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end @@ -4755,12 +4755,12 @@ define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB102_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB102_1; ; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end @@ -4802,12 +4802,12 @@ define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB103_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB103_1; ; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end @@ -4849,12 +4849,12 @@ define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB104_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; 
SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB104_1; ; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end @@ -4896,12 +4896,12 @@ define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB105_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB105_1; ; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end @@ -4943,12 +4943,12 @@ define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB106_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB106_1; ; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end @@ -4990,12 +4990,12 @@ define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB107_3; ; SM90-NEXT: // 
%bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB107_1; ; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end @@ -5037,12 +5037,12 @@ define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB108_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB108_1; ; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end @@ -5084,12 +5084,12 @@ define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB109_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB109_1; ; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end @@ -5131,12 +5131,12 @@ define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB110_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB110_1; ; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end @@ -5178,12 +5178,12 @@ define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB111_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB111_1; ; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end @@ -5225,12 +5225,12 @@ define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB112_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB112_1; ; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end @@ -5272,12 +5272,12 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; 
SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB113_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB113_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB113_1; ; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end @@ -5319,12 +5319,12 @@ define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB114_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB114_1; ; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end @@ -5366,12 +5366,12 @@ define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB115_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB115_1; ; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end @@ -5413,12 +5413,12 @@ define i8 
@acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB116_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB116_1; ; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end @@ -5460,12 +5460,12 @@ define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB117_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB117_1; ; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end @@ -5507,12 +5507,12 @@ define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB118_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB118_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, 
%r8; ; SM90-NEXT: @%p2 bra $L__BB118_1; ; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end @@ -5554,12 +5554,12 @@ define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB119_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB119_1; ; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end @@ -5601,12 +5601,12 @@ define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB120_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB120_1; ; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end @@ -5648,12 +5648,12 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB121_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB121_1; ; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end @@ -5695,12 +5695,12 @@ define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB122_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB122_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB122_1; ; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end @@ -5742,12 +5742,12 @@ define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB123_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB123_1; ; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end @@ -5789,12 +5789,12 @@ define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB124_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM90-NEXT: // in Loop: Header=BB124_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB124_1; ; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end @@ -5836,12 +5836,12 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB125_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB125_1; ; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end @@ -5883,12 +5883,12 @@ define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB126_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB126_1; ; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end @@ -5930,12 +5930,12 @@ define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB127_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB127_1; ; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end @@ -5977,12 +5977,12 @@ define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB128_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB128_1; ; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end @@ -6024,12 +6024,12 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB129_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB129_1; ; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end @@ -6071,12 +6071,12 @@ define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, 
%r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB130_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB130_1; ; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end @@ -6118,12 +6118,12 @@ define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB131_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB131_1; ; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end @@ -6165,12 +6165,12 @@ define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB132_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB132_1; ; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end @@ -6212,12 +6212,12 @@ define i8 
@acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB133_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB133_1; ; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end @@ -6259,12 +6259,12 @@ define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB134_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB134_1; ; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end @@ -6306,12 +6306,12 @@ define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB135_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB135_1; ; 
SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end @@ -6353,12 +6353,12 @@ define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB136_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB136_1; ; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end @@ -6400,12 +6400,12 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB137_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB137_1; ; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end @@ -6447,12 +6447,12 @@ define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB138_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, 
%r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB138_1; ; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end @@ -6494,12 +6494,12 @@ define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB139_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB139_1; ; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end @@ -6541,12 +6541,12 @@ define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB140_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB140_1; ; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end @@ -6588,12 +6588,12 @@ define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB141_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB141_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB141_1; ; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end @@ -6635,12 +6635,12 @@ define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB142_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB142_1; ; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end @@ -6682,12 +6682,12 @@ define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB143_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB143_1; ; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end @@ -6729,12 +6729,12 @@ define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, 
%r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB144_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB144_1; ; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end @@ -6776,12 +6776,12 @@ define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB145_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB145_1; ; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end @@ -6823,12 +6823,12 @@ define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB146_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB146_1; ; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end @@ -6870,12 +6870,12 @@ define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; 
SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB147_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB147_1; ; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end @@ -6917,12 +6917,12 @@ define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB148_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB148_1; ; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end @@ -6964,12 +6964,12 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB149_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB149_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB149_1; ; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end @@ -7011,12 +7011,12 @@ define i8 
@seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB150_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB150_1; ; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end @@ -7058,12 +7058,12 @@ define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB151_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB151_1; ; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end @@ -7105,12 +7105,12 @@ define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB152_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, 
%r8; ; SM90-NEXT: @%p2 bra $L__BB152_1; ; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end @@ -7152,12 +7152,12 @@ define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB153_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB153_1; ; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end @@ -7199,12 +7199,12 @@ define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB154_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB154_1; ; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end @@ -7246,12 +7246,12 @@ define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 % ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB155_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB155_1; ; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end @@ -7293,12 +7293,12 @@ define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB156_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB156_1; ; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end @@ -7340,12 +7340,12 @@ define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB157_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB157_1; ; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end @@ -7387,12 +7387,12 @@ define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB158_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB158_1; ; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end @@ -7434,12 +7434,12 @@ define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB159_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB159_1; ; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end @@ -7481,12 +7481,12 @@ define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB160_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB160_1; ; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end @@ -7528,12 +7528,12 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; 
+; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB161_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB161_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB161_1; ; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end @@ -7575,12 +7575,12 @@ define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB162_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB162_1; ; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end @@ -7622,12 +7622,12 @@ define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB163_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB163_1; ; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end @@ -7669,12 +7669,12 @@ define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, 
%r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB164_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB164_1; ; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end @@ -7716,12 +7716,12 @@ define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB165_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB165_1; ; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end @@ -7763,12 +7763,12 @@ define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB166_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB166_1; ; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end @@ -7810,12 +7810,12 @@ define i8 
@seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB167_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB167_1; ; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end @@ -7857,12 +7857,12 @@ define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB168_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB168_1; ; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end @@ -7904,12 +7904,12 @@ define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB169_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra 
$L__BB169_1; ; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end @@ -7951,12 +7951,12 @@ define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB170_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB170_1; ; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end @@ -7998,12 +7998,12 @@ define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB171_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB171_1; ; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end @@ -8045,12 +8045,12 @@ define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB172_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; 
+; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB172_1; ; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end @@ -8092,12 +8092,12 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB173_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB173_1; ; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end @@ -8139,12 +8139,12 @@ define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB174_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB174_1; ; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end @@ -8186,12 +8186,12 @@ define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB175_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM90-NEXT: // in Loop: Header=BB175_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB175_1; ; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end @@ -8233,12 +8233,12 @@ define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB176_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB176_1; ; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end @@ -8280,12 +8280,12 @@ define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB177_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB177_1; ; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end @@ -8327,12 +8327,12 @@ define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB178_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB178_1; ; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end @@ -8374,12 +8374,12 @@ define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB179_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB179_1; ; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end @@ -8419,12 +8419,12 @@ define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB180_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB180_1; ; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end @@ -8463,12 +8463,12 @@ define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; 
SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB181_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB181_1; ; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end @@ -8507,12 +8507,12 @@ define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %ne ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB182_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB182_1; ; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end @@ -8551,12 +8551,12 @@ define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB183_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB183_1; ; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end @@ -8595,12 +8595,12 @@ define i16 @monotonic_monotonic_i16_global_sys(ptr 
addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB184_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB184_1; ; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end @@ -8639,12 +8639,12 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB185_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB185_1; ; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end @@ -8683,12 +8683,12 @@ define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 % ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB186_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB186_1; ; 
SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end @@ -8727,12 +8727,12 @@ define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB187_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB187_1; ; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end @@ -8771,12 +8771,12 @@ define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB188_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB188_1; ; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end @@ -8815,12 +8815,12 @@ define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB189_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, 
%r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB189_1; ; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end @@ -8859,12 +8859,12 @@ define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 % ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB190_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB190_1; ; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end @@ -8903,12 +8903,12 @@ define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB191_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB191_1; ; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end @@ -8947,12 +8947,12 @@ define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB192_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; 
SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB192_1; ; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end @@ -8992,12 +8992,12 @@ define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB193_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB193_1; ; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end @@ -9037,12 +9037,12 @@ define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB194_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB194_1; ; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end @@ -9082,12 +9082,12 @@ define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, 
%r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB195_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB195_1; ; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end @@ -9127,12 +9127,12 @@ define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB196_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB196_1; ; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end @@ -9172,12 +9172,12 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB197_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB197_1; ; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end @@ -9217,12 +9217,12 @@ define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB198_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB198_1; ; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end @@ -9262,12 +9262,12 @@ define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB199_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB199_1; ; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end @@ -9307,12 +9307,12 @@ define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB200_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB200_1; ; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end @@ -9352,12 +9352,12 @@ define i16 
@monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB201_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB201_1; ; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end @@ -9397,12 +9397,12 @@ define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB202_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB202_1; ; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end @@ -9442,12 +9442,12 @@ define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB203_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, 
%r8; ; SM90-NEXT: @%p2 bra $L__BB203_1; ; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end @@ -9488,12 +9488,12 @@ define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB204_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB204_1; ; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end @@ -9534,12 +9534,12 @@ define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB205_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB205_1; ; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end @@ -9580,12 +9580,12 @@ define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB206_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB206_1; ; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end @@ -9626,12 +9626,12 @@ define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB207_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB207_1; ; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end @@ -9672,12 +9672,12 @@ define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB208_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB208_1; ; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end @@ -9718,12 +9718,12 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB209_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB209_1; ; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end @@ -9764,12 +9764,12 @@ define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB210_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB210_1; ; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end @@ -9810,12 +9810,12 @@ define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB211_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB211_1; ; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end @@ -9856,12 +9856,12 @@ define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB212_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB212_1; ; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end @@ -9902,12 +9902,12 @@ define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB213_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB213_1; ; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end @@ -9948,12 +9948,12 @@ define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB214_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB214_1; ; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end @@ -9994,12 +9994,12 @@ define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, 
%r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB215_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB215_1; ; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end @@ -10039,12 +10039,12 @@ define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB216_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB216_1; ; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end @@ -10084,12 +10084,12 @@ define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB217_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB217_1; ; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end @@ -10129,12 +10129,12 @@ define 
i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB218_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB218_1; ; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end @@ -10174,12 +10174,12 @@ define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB219_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB219_1; ; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end @@ -10219,12 +10219,12 @@ define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB220_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; 
SM90-NEXT: @%p2 bra $L__BB220_1; ; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end @@ -10264,12 +10264,12 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB221_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB221_1; ; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end @@ -10309,12 +10309,12 @@ define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB222_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB222_1; ; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end @@ -10354,12 +10354,12 @@ define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB223_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB223_1; ; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end @@ -10399,12 +10399,12 @@ define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB224_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB224_1; ; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end @@ -10444,12 +10444,12 @@ define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB225_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB225_1; ; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end @@ -10489,12 +10489,12 @@ define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra 
$L__BB226_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB226_1; ; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end @@ -10534,12 +10534,12 @@ define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB227_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB227_1; ; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end @@ -10579,12 +10579,12 @@ define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB228_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB228_1; ; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end @@ -10624,12 +10624,12 @@ define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; 
SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB229_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB229_1; ; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end @@ -10669,12 +10669,12 @@ define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB230_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB230_1; ; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end @@ -10714,12 +10714,12 @@ define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB231_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB231_1; ; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end @@ -10759,12 +10759,12 @@ define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; 
; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB232_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB232_1; ; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end @@ -10804,12 +10804,12 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB233_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB233_1; ; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end @@ -10849,12 +10849,12 @@ define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB234_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB234_1; ; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end @@ -10894,12 
+10894,12 @@ define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB235_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB235_1; ; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end @@ -10939,12 +10939,12 @@ define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB236_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB236_1; ; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end @@ -10984,12 +10984,12 @@ define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB237_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; 
SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB237_1; ; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end @@ -11029,12 +11029,12 @@ define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB238_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB238_1; ; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end @@ -11074,12 +11074,12 @@ define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB239_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB239_1; ; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end @@ -11120,12 +11120,12 @@ define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB240_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 ; 
SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB240_1; ; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end @@ -11166,12 +11166,12 @@ define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB241_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB241_1; ; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end @@ -11212,12 +11212,12 @@ define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB242_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB242_1; ; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end @@ -11258,12 +11258,12 @@ define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB243_3; ; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB243_1; ; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end @@ -11304,12 +11304,12 @@ define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB244_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB244_1; ; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end @@ -11350,12 +11350,12 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB245_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB245_1; ; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end @@ -11396,12 +11396,12 @@ define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], 
%r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB246_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB246_1; ; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end @@ -11442,12 +11442,12 @@ define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB247_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB247_1; ; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end @@ -11488,12 +11488,12 @@ define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB248_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB248_1; ; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end @@ -11534,12 +11534,12 @@ define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; 
SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB249_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB249_1; ; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end @@ -11580,12 +11580,12 @@ define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB250_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB250_1; ; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end @@ -11626,12 +11626,12 @@ define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB251_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB251_1; ; SM90-NEXT: $L__BB251_3: // 
%partword.cmpxchg.end @@ -11672,12 +11672,12 @@ define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB252_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB252_1; ; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end @@ -11717,12 +11717,12 @@ define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB253_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB253_1; ; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end @@ -11762,12 +11762,12 @@ define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB254_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, 
%r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB254_1; ; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end @@ -11807,12 +11807,12 @@ define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB255_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB255_1; ; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end @@ -11852,12 +11852,12 @@ define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB256_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB256_1; ; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end @@ -11897,12 +11897,12 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB257_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB257_1 
Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB257_1; ; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end @@ -11942,12 +11942,12 @@ define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB258_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB258_1; ; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end @@ -11987,12 +11987,12 @@ define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB259_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB259_1; ; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end @@ -12032,12 +12032,12 @@ define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, 
%r17; ; SM90-NEXT: @%p1 bra $L__BB260_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB260_1; ; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end @@ -12077,12 +12077,12 @@ define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB261_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB261_1; ; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end @@ -12122,12 +12122,12 @@ define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB262_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB262_1; ; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end @@ -12167,12 +12167,12 @@ define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB263_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB263_1; ; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end @@ -12212,12 +12212,12 @@ define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB264_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB264_1; ; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end @@ -12258,12 +12258,12 @@ define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB265_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB265_1; ; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end @@ -12304,12 +12304,12 @@ define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 
%cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB266_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB266_1; ; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end @@ -12350,12 +12350,12 @@ define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB267_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB267_1; ; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end @@ -12396,12 +12396,12 @@ define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB268_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB268_1; ; SM90-NEXT: $L__BB268_3: // 
%partword.cmpxchg.end @@ -12442,12 +12442,12 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB269_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB269_1; ; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end @@ -12488,12 +12488,12 @@ define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB270_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB270_1; ; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end @@ -12534,12 +12534,12 @@ define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB271_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; 
SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB271_1; ; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end @@ -12580,12 +12580,12 @@ define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB272_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB272_1; ; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end @@ -12626,12 +12626,12 @@ define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB273_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB273_1; ; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end @@ -12672,12 +12672,12 @@ define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB274_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure 
; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB274_1; ; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end @@ -12718,12 +12718,12 @@ define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB275_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB275_1; ; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end @@ -12764,12 +12764,12 @@ define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB276_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB276_1; ; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end @@ -12810,12 +12810,12 @@ define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 
%p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB277_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB277_1; ; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end @@ -12856,12 +12856,12 @@ define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB278_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB278_1; ; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end @@ -12902,12 +12902,12 @@ define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB279_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB279_1; ; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end @@ -12948,12 +12948,12 @@ define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB280_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB280_1; ; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end @@ -12994,12 +12994,12 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB281_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB281_1; ; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end @@ -13040,12 +13040,12 @@ define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB282_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB282_1; ; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end @@ -13086,12 +13086,12 @@ define i16 
@release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB283_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB283_1; ; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end @@ -13132,12 +13132,12 @@ define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB284_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB284_1; ; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end @@ -13178,12 +13178,12 @@ define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB285_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, 
%r8; ; SM90-NEXT: @%p2 bra $L__BB285_1; ; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end @@ -13224,12 +13224,12 @@ define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB286_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB286_1; ; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end @@ -13270,12 +13270,12 @@ define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB287_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB287_1; ; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end @@ -13316,12 +13316,12 @@ define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB288_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB288_1; ; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end @@ -13362,12 +13362,12 @@ define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB289_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB289_1; ; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end @@ -13408,12 +13408,12 @@ define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB290_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB290_1; ; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end @@ -13454,12 +13454,12 @@ define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB291_3; ; SM90-NEXT: // %bb.2: 
// %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB291_1; ; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end @@ -13500,12 +13500,12 @@ define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB292_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB292_1; ; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end @@ -13546,12 +13546,12 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB293_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB293_1; ; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end @@ -13592,12 +13592,12 @@ define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB294_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB294_1; ; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end @@ -13638,12 +13638,12 @@ define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB295_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB295_1; ; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end @@ -13684,12 +13684,12 @@ define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB296_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB296_1; ; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end @@ -13730,12 +13730,12 @@ define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, 
%r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB297_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB297_1; ; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end @@ -13776,12 +13776,12 @@ define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB298_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB298_1; ; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end @@ -13822,12 +13822,12 @@ define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB299_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB299_1; ; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end @@ 
-13868,12 +13868,12 @@ define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB300_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB300_1; ; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end @@ -13914,12 +13914,12 @@ define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB301_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB301_1; ; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end @@ -13960,12 +13960,12 @@ define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB302_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 
%r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB302_1; ; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end @@ -14006,12 +14006,12 @@ define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB303_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB303_1; ; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end @@ -14052,12 +14052,12 @@ define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB304_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB304_1; ; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end @@ -14098,12 +14098,12 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB305_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB305_1; ; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end @@ -14144,12 +14144,12 @@ define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB306_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB306_1; ; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end @@ -14190,12 +14190,12 @@ define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB307_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB307_1; ; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end @@ -14236,12 +14236,12 @@ define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra 
$L__BB308_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB308_1; ; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end @@ -14282,12 +14282,12 @@ define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB309_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB309_1; ; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end @@ -14328,12 +14328,12 @@ define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB310_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB310_1; ; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end @@ -14374,12 +14374,12 @@ define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB311_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB311_1; ; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end @@ -14420,12 +14420,12 @@ define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB312_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB312_1; ; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end @@ -14466,12 +14466,12 @@ define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB313_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB313_1; ; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end @@ -14512,12 +14512,12 @@ define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: 
or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB314_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB314_1; ; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end @@ -14558,12 +14558,12 @@ define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB315_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB315_1; ; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end @@ -14604,12 +14604,12 @@ define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB316_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB316_1; ; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end @@ 
-14650,12 +14650,12 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB317_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB317_1; ; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end @@ -14696,12 +14696,12 @@ define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB318_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB318_1; ; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end @@ -14742,12 +14742,12 @@ define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB319_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, 
%r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB319_1; ; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end @@ -14788,12 +14788,12 @@ define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB320_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB320_1; ; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end @@ -14834,12 +14834,12 @@ define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB321_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB321_1; ; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end @@ -14880,12 +14880,12 @@ define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB322_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: 
Header=BB322_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB322_1; ; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end @@ -14926,12 +14926,12 @@ define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB323_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB323_1; ; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end @@ -14972,12 +14972,12 @@ define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB324_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB324_1; ; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end @@ -15018,12 +15018,12 @@ define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; 
SM90-NEXT: @%p1 bra $L__BB325_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB325_1; ; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end @@ -15064,12 +15064,12 @@ define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB326_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB326_1; ; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end @@ -15110,12 +15110,12 @@ define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB327_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB327_1; ; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end @@ -15156,12 +15156,12 @@ define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: 
atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB328_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB328_1; ; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end @@ -15202,12 +15202,12 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB329_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB329_1; ; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end @@ -15248,12 +15248,12 @@ define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB330_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB330_1; ; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end @@ -15294,12 +15294,12 @@ define i16 
@seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB331_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB331_1; ; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end @@ -15340,12 +15340,12 @@ define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB332_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB332_1; ; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end @@ -15386,12 +15386,12 @@ define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB333_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, 
%r8; ; SM90-NEXT: @%p2 bra $L__BB333_1; ; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end @@ -15432,12 +15432,12 @@ define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cm ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB334_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB334_1; ; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end @@ -15478,12 +15478,12 @@ define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB335_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB335_1; ; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end @@ -15524,12 +15524,12 @@ define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB336_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB336_1; ; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end @@ -15570,12 +15570,12 @@ define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB337_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB337_1; ; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end @@ -15616,12 +15616,12 @@ define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB338_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB338_1; ; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end @@ -15662,12 +15662,12 @@ define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB339_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB339_1; ; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end @@ -15708,12 +15708,12 @@ define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB340_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB340_1; ; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end @@ -15754,12 +15754,12 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB341_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB341_1; ; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end @@ -15800,12 +15800,12 @@ define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: 
setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB342_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB342_1; ; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end @@ -15846,12 +15846,12 @@ define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB343_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB343_1; ; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end @@ -15892,12 +15892,12 @@ define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB344_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB344_1; ; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end @@ -15938,12 +15938,12 @@ define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, 
%r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB345_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB345_1; ; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end @@ -15984,12 +15984,12 @@ define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB346_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB346_1; ; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end @@ -16030,12 +16030,12 @@ define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB347_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB347_1; ; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end @@ 
-16076,12 +16076,12 @@ define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB348_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB348_1; ; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end @@ -16122,12 +16122,12 @@ define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB349_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB349_1; ; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end @@ -16168,12 +16168,12 @@ define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB350_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 
%r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB350_1; ; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end @@ -16214,12 +16214,12 @@ define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB351_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB351_1; ; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end @@ -16260,12 +16260,12 @@ define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB352_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB352_1; ; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end @@ -16306,12 +16306,12 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB353_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 ; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB353_1; ; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end @@ -16352,12 +16352,12 @@ define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB354_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB354_1; ; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end @@ -16398,12 +16398,12 @@ define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB355_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB355_1; ; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end @@ -16444,12 +16444,12 @@ define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra 
$L__BB356_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB356_1; ; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end @@ -16490,12 +16490,12 @@ define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB357_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB357_1; ; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end @@ -16536,12 +16536,12 @@ define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB358_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB358_1; ; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end @@ -16582,12 +16582,12 @@ define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: or.b32 %r16, %r19, %r3; ; SM90-NEXT: or.b32 %r17, %r19, %r4; ; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB359_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; ; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB359_1; ; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index e087fcfe87917..237e42394ba2f 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -79,7 +79,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -206,7 +206,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -336,7 +336,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; 
SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -466,7 +466,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -598,7 +598,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -726,7 +726,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -850,7 +850,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -977,7 +977,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1104,7 +1104,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1234,7 +1234,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index 70330d322decf..2841e6751d029 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -209,7 +209,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r39, %r48, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; ; CHECK-NEXT: setp.eq.b32 %p1, %r6, %r39; ; CHECK-NEXT: @%p1 bra $L__BB4_3; ; 
CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 @@ -224,7 +224,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r41, %r49, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; ; CHECK-NEXT: setp.eq.b32 %p3, %r10, %r41; ; CHECK-NEXT: @%p3 bra $L__BB4_6; ; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 @@ -241,7 +241,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r43, %r50, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; ; CHECK-NEXT: setp.eq.b32 %p5, %r14, %r43; ; CHECK-NEXT: @%p5 bra $L__BB4_9; ; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 @@ -257,7 +257,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r45, %r51, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; ; CHECK-NEXT: setp.eq.b32 %p7, %r18, %r45; ; CHECK-NEXT: @%p7 bra $L__BB4_12; ; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 @@ -274,7 +274,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r47, %r52, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; +; CHECK-NEXT: 
atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; ; CHECK-NEXT: setp.eq.b32 %p9, %r22, %r47; ; CHECK-NEXT: @%p9 bra $L__BB4_15; ; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure From e007be20be079f28f2fbc1def59cd91aae0e5661 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Thu, 10 Jul 2025 21:31:51 +0000 Subject: [PATCH 22/26] remove unnecessary multiclass --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d6dd076340019..53dd3e2785aed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -80,26 +80,6 @@ def Ordering_sequentially_consistent : PatLeaf<(i32 7)>; // SequentiallyConsiste def Ordering_volatile : PatLeaf<(i32 8)>; // Volatile = 8 def Ordering_relaxed_mmio : PatLeaf<(i32 9)>; // RelaxedMMIO = 9 -multiclass nvvm_ternary_atomic_op_scoped { - defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val); - def NAME#_cta: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ - return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Block; - }]>; - def NAME#_cluster : PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ - return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Cluster; - }]>; - def NAME#_gpu: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ - return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::Device; - }]>; - def NAME#_sys: PatFrag(NAME) node:$ptr, node:$cmp, node:$val), [{ - return Scopes[cast(N)->getSyncScopeID()] == NVPTX::Scope::System; - }]>; -} - // A node that will be replaced with the current PTX version. 
class PTX { From 8a79f5da8a38efe89ff9f28b0a25825cde293dfd Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 11 Jul 2025 21:45:18 +0000 Subject: [PATCH 23/26] only emit one slice of tests --- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 22354 +-------------------- llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 16694 +--------------- llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 22365 +--------------------- llvm/test/CodeGen/NVPTX/cmpxchg.py | 102 +- 4 files changed, 2748 insertions(+), 58767 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 655ee851c4083..63c389c36e87e 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; 
; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 
%rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -70,13 +70,13 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
monotonic_monotonic_i8_generic_gpu( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -160,13 +163,13 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,14 +179,15 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; 
SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -191,10 +195,10 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -211,7 +215,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -221,14 +225,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra 
$L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -236,10 +241,11 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -256,7 +262,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -266,14 +272,15 @@ define i8 
@monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -281,10 +288,11 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -313,12 +321,12 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 
%cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -326,10 +334,11 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -346,7 +355,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -356,14 +365,15 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -371,10 +381,11 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -385,13 +396,13 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; 
SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -401,14 +412,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -416,10 +428,11 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -430,13 +443,13 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; 
SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -446,14 +459,15 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -461,10 +475,11 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 
%r9, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -475,13 +490,13 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -491,14 +506,15 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -506,10 +522,11 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -520,13 +537,13 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -536,14 +553,15 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic( 
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -551,10 +569,11 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -565,13 +584,13 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -581,15 +600,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -597,10 +616,11 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -611,13 +631,13 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, 
%r18; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -627,15 +647,15 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -643,10 +663,11 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -657,13 +678,13 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 
%r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -676,22033 +697,1426 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic_gpu( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: 
mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB15_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg 
.b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB16_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: 
// %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_sys( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB17_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB17_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_cta( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB18_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global_gpu( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra 
$L__BB19_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: 
and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB20_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_sys( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 
%rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_sys_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_cta( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, 
%r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB22_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared_gpu( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, 
[monotonic_acquire_i8_shared_gpu_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB23_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic( +define i16 
@acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB24_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure 
; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_sys( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, 
%r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB25_3; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_cta( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; 
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB26_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg 
ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: 
atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB27_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; 
; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB28_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB28_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_sys( +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 
%cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1 
; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_cta( +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, 
%r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB30_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB30_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB30_1; -; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global_gpu( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; 
-; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB31_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB31_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB31_1; -; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred 
%p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB32_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB32_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB32_1; -; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst - ret i8 %new -} - -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: 
.reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB33_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB33_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB33_1; -; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new -} - -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB34_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB34_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB34_1; -; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new -} - -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB35_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB35_1; -; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, 
[acquire_monotonic_i8_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB36_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB36_1; -; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB37_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB37_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB37_1; -; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB38_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB38_1; -; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM60-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB39_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB39_1; -; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB40_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB40_1; -; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB41_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: 
Header=BB41_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB41_1; -; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB42_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: 
@%p2 bra $L__BB42_1; -; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB43_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB43_1; -; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], 
%r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB44_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB44_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB44_1; -; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic - ret i8 %new -} - -define i8 
@acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB45_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB45_1; -; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_cta( -; SM60: { -; 
SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB46_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB46_1; -; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB47_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB47_1; -; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; -; 
SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB48_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB48_1; -; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, 
%r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB49_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB49_1; -; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; 
SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB50_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB50_1; -; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB51_1: // 
%partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB51_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB51_1; -; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, 
[%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB52_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB52_1; -; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB53_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in 
Loop: Header=BB53_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB53_1; -; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB54_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 
bra $L__BB54_1; -; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB55_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB55_1; -; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB56_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB56_1; -; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB57_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB57_1; -; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg 
.b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB58_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB58_1; -; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[acquire_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB59_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB59_1; -; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, 
[acquire_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB60_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB60_1; -; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 
3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB61_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB61_1; -; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 
%r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB62_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB62_1; -; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; 
-; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB63_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB63_1; -; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, 
%r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB64_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB64_1; -; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra 
$L__BB65_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB65_1; -; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB66_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM60-NEXT: and.b32 
%r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB66_1; -; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB67_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB67_1; -; 
SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB68_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB68_1; -; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB69_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB69_1; -; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 
@acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB70_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB70_1; -; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared_gpu( -; SM60: 
{ -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB71_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB71_1; -; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: 
-; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB72_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB72_1; -; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; 
SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB73_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB73_1; -; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 
3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB74_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB74_1; -; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, 
%rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB75_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB75_1; -; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, 
%r16, %r2; -; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB76_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB76_1; -; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; 
SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB77_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB77_1; -; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; 
SM60-NEXT: @%p1 bra $L__BB78_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB78_1; -; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB79_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1 -; 
SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB79_1; -; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB80_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB80_1; -; SM60-NEXT: 
$L__BB80_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB81_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB81_1; -; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 
%new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB82_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB82_1; -; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { 
-; SM60-LABEL: release_monotonic_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB83_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB83_1; -; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; 
SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB84_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB84_1; -; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[release_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB85_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB85_1; -; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB86_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB86_1; -; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB87_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB87_1; -; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, 
%r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB88_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB88_1; -; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: 
$L__BB89_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB89_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB89_1; -; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; 
SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB90_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB90_1; -; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, 
%r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB91_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB91_1; -; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB92_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1 -; 
SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB92_1; -; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB93_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB93_1; -; 
SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB94_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB94_1; -; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB95_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB95_1; -; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} 
- -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB96_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB96_1; -; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; 
SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB97_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB97_1; -; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[release_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB98_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB98_1; -; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: 
ld.param.b8 %r9, [release_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB99_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB99_1; -; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; 
SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB100_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB100_1; -; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB101_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB101_1; -; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB102_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB102_1; -; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: 
$L__BB103_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB103_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB103_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB103_1; -; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: 
or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB104_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB104_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB104_1; -; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB105_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: 
@%p1 bra $L__BB105_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB105_1; -; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB106_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1 -; 
SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB106_1; -; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB107_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: 
@%p2 bra $L__BB107_1; -; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB108_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB108_1; -; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB109_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB109_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB109_1; -; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_cta(ptr 
%addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB110_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB110_1; -; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 
%rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB111_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB111_1; -; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB112_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB112_1; -; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: 
membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB113_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB113_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB113_1; -; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB114_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB114_1; -; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, 
%r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB115_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB115_1; -; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB116_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB116_1; -; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, 
%r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB117_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB117_1; -; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB118_1: 
// %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB118_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB118_1; -; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, 
%r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB119_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB119_1; -; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: 
@%p1 bra $L__BB120_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB120_1; -; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB121_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: 
setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB121_1; -; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB122_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB122_1; -; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM60-NEXT: 
membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB123_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB123_1; -; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel 
acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB124_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB124_1; -; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_sys( -; 
SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB125_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB125_1; -; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 
%rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB126_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB126_1; -; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; 
SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB127_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB127_1; -; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, 
[acq_rel_acquire_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB128_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB128_1; -; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, 
%r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB129_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB129_1; -; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; 
SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB130_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB130_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB130_1; -; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: 
shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB131_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB131_1; -; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: 
$L__BB132_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB132_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB132_1; -; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; 
SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB133_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB133_1; -; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB134_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB134_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB134_1; -; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB135_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; 
SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB135_1; -; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB136_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB136_1; -; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB137_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB137_1; -; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel 
seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB138_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB138_1; -; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM60-LABEL: acq_rel_seq_cst_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB139_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB139_1; -; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: 
.reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB140_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB140_1; -; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB141_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB141_1; -; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: 
membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB142_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB142_1; -; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB143_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB143_1; -; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; 
-; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB144_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB144_1; -; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: 
shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB145_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB145_1; -; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: 
$L__BB146_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB146_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB146_1; -; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 
%r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB147_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB147_1; -; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra 
$L__BB148_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB148_1; -; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB149_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1 -; SM60-NEXT: 
and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB149_1; -; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB150_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 
bra $L__BB150_1; -; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB151_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB151_1; -; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB152_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB152_1; -; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst 
monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB153_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB153_1; -; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { 
-; SM60-LABEL: seq_cst_monotonic_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB154_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB154_1; -; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg 
.b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB155_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB155_1; -; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB156_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB156_1; -; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 
%r9, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB157_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB157_1; -; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; 
SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB158_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB158_1; -; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB159_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB159_1; -; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, 
[%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB160_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB160_1; -; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB161_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB161_1; -; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, 
[%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB162_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB162_1; -; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB163_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB163_1; -; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB164_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, 
%r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB164_1; -; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB165_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB165_1; -; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; 
SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB166_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB166_1; -; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) 
%addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB167_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB167_1; -; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr 
%addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB168_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB168_1; -; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: 
.reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB169_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB169_1; -; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB170_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB170_1; -; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM60-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB171_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB171_1; -; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB172_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB172_1; -; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 
255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB173_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB173_1; -; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; 
SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB174_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB174_1; -; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop 
Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB175_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB175_1; -; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], 
%r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB176_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB176_1; -; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB177_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure 
-; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB177_1; -; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB178_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; 
-; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB178_1; -; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB179_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB179_1; -; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end -; 
SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB180_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB180_1; -; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 
@monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB181_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB181_1; -; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg 
.b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB182_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB182_1; -; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB183_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB183_1; -; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: 
and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB184_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB184_1; -; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 
%r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB185_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB185_1; -; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: 
$L__BB186_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB186_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB186_1; -; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; 
SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB187_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB187_1; -; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB188_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM60-NEXT: // in Loop: Header=BB188_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB188_1; -; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB189_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB189_1; 
-; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB190_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB190_1; -; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 
%new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB191_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB191_1; -; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: 
monotonic_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB192_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB192_1; -; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB193_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB193_1; -; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: 
ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB194_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB194_1; -; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 
%r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB195_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB195_1; -; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB196_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB196_1; -; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 
-; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB197_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB197_1; -; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 
%p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB198_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB198_1; -; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB199_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1 -; 
SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB199_1; -; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB200_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB200_1; -; SM60-NEXT: $L__BB200_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB201_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB201_1; -; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new 
syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB202_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB202_1; -; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: 
monotonic_acquire_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB203_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB203_1; -; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; 
SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB204_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB204_1; -; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; -; 
SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB205_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB205_1; -; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; 
-; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB206_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB206_1; -; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 
%r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB207_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB207_1; -; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: 
and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB208_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB208_1; -; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, 
%r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB209_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB209_1; -; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 
%p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB210_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB210_1; -; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB211_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: 
Header=BB211_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB211_1; -; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB212_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 
bra $L__BB212_1; -; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB213_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB213_1; -; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB214_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB214_1; -; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - 
-define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB215_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB215_1; -; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic( -; SM60: { -; 
SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB216_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB216_1; -; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acquire_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB217_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB217_1; -; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, 
[acquire_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB218_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB218_1; -; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB219_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB219_1; -; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 
%r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB220_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB220_1; -; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 
%r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB221_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB221_1; -; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; 
SM60-NEXT: @%p1 bra $L__BB222_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB222_1; -; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB223_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1 -; SM60-NEXT: and.b32 %r8, 
%r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB223_1; -; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB224_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB224_1; -; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end -; 
SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB225_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB225_1; -; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire 
monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB226_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB226_1; -; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: 
acquire_monotonic_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB227_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB227_1; -; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; 
SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB228_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB228_1; -; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: 
ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB229_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB229_1; -; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; 
SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB230_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB230_1; -; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, 
[%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB231_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB231_1; -; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 
%r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB232_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB232_1; -; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB233_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB233_1; -; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB234_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 
%r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB234_1; -; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB235_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB235_1; -; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; 
-; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB236_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB236_1; -; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 
%cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB237_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB237_1; -; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; 
SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB238_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB238_1; -; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 
%rd2, [acquire_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB239_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB239_1; -; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB240_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB240_1; -; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, 
%r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB241_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB241_1; -; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, 
%r2; -; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB242_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB242_1; -; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; 
-; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB243_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB243_1; -; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB244_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB244_1; -; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB245_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: 
mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB245_1; -; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB246_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB246_1; -; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB247_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB247_1; -; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire 
seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB248_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB248_1; -; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_sys( -; SM60: { -; 
SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB249_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB249_1; -; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: 
// %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB250_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB250_1; -; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB251_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB251_1; -; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; -; SM60-NEXT: and.b64 
%rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB252_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB252_1; -; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: 
not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB253_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB253_1; -; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, 
%r2; -; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB254_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB254_1; -; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: 
atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB255_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB255_1; -; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB256_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in 
Loop: Header=BB256_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB256_1; -; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB257_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB257_1; -; 
SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB258_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB258_1; -; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 
%cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB259_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB259_1; -; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; 
SM60-LABEL: release_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB260_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB260_1; -; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; 
SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB261_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB261_1; -; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB262_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB262_1; -; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; -; SM60-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB263_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB263_1; -; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, 
%r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB264_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB264_1; -; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; 
SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB265_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB265_1; -; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; 
SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB266_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB266_1; -; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB267_3; -; SM60-NEXT: 
// %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB267_1; -; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB268_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: 
mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB268_1; -; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB269_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB269_1; -; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 
[func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB270_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB270_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB270_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB270_1; -; SM60-NEXT: $L__BB270_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 
%new -} - -define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB271_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB271_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB271_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB271_1; -; SM60-NEXT: $L__BB271_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared( -; SM60: { 
-; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB272_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB272_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB272_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB272_1; -; SM60-NEXT: $L__BB272_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB273_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB273_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB273_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB273_1; -; SM60-NEXT: $L__BB273_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; -; 
SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB274_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB274_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB274_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB274_1; -; SM60-NEXT: $L__BB274_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_acquire_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB275_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB275_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB275_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB275_1; -; SM60-NEXT: $L__BB275_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; 
SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB276_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB276_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB276_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB276_1; -; SM60-NEXT: $L__BB276_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 
%r19, %r15, %r2; -; SM60-NEXT: $L__BB277_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB277_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB277_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB277_1; -; SM60-NEXT: $L__BB277_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB278_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, 
%r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB278_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB278_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB278_1; -; SM60-NEXT: $L__BB278_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB279_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB279_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB279_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB279_1; -; SM60-NEXT: $L__BB279_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB280_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB280_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB280_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, 
%r8; -; SM60-NEXT: @%p2 bra $L__BB280_1; -; SM60-NEXT: $L__BB280_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB281_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB281_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB281_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB281_1; -; SM60-NEXT: $L__BB281_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; 
-; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB282_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB282_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB282_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB282_1; -; SM60-NEXT: $L__BB282_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 
@release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB283_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB283_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB283_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB283_1; -; SM60-NEXT: $L__BB283_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg 
.pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB284_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB284_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB284_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB284_1; -; SM60-NEXT: $L__BB284_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 
%rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB285_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB285_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB285_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB285_1; -; SM60-NEXT: $L__BB285_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; 
SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB286_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB286_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB286_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB286_1; -; SM60-NEXT: $L__BB286_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_seq_cst_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB287_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB287_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB287_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB287_1; -; SM60-NEXT: $L__BB287_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 
%r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB288_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB288_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB288_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB288_1; -; SM60-NEXT: $L__BB288_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, 
%r15, %r2; -; SM60-NEXT: $L__BB289_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB289_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB289_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB289_1; -; SM60-NEXT: $L__BB289_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB290_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 
%r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB290_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB290_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB290_1; -; SM60-NEXT: $L__BB290_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB291_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB291_3; -; SM60-NEXT: // 
%bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB291_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB291_1; -; SM60-NEXT: $L__BB291_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB292_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB292_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB292_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; 
SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB292_1; -; SM60-NEXT: $L__BB292_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB293_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB293_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB293_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB293_1; -; SM60-NEXT: $L__BB293_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; 
SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB294_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB294_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB294_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB294_1; -; SM60-NEXT: $L__BB294_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new 
syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB295_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB295_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB295_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB295_1; -; SM60-NEXT: $L__BB295_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 
%new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB296_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB296_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB296_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB296_1; -; SM60-NEXT: $L__BB296_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; 
SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB297_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB297_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB297_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB297_1; -; SM60-NEXT: $L__BB297_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acq_rel_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB298_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB298_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB298_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB298_1; -; SM60-NEXT: $L__BB298_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: 
membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB299_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB299_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB299_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB299_1; -; SM60-NEXT: $L__BB299_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB300_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB300_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB300_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB300_1; -; SM60-NEXT: $L__BB300_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 
%r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB301_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB301_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB301_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB301_1; -; SM60-NEXT: $L__BB301_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB302_1: 
// %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB302_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB302_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB302_1; -; SM60-NEXT: $L__BB302_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB303_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB303_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB303_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB303_1; -; SM60-NEXT: $L__BB303_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB304_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB304_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: 
Header=BB304_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB304_1; -; SM60-NEXT: $L__BB304_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB305_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB305_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB305_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB305_1; -; SM60-NEXT: $L__BB305_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB306_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB306_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB306_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB306_1; -; SM60-NEXT: $L__BB306_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB307_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB307_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB307_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB307_1; -; SM60-NEXT: $L__BB307_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 
@acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB308_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB308_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB308_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB308_1; -; SM60-NEXT: $L__BB308_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg 
.b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB309_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB309_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB309_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB309_1; -; SM60-NEXT: $L__BB309_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acq_rel_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB310_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB310_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB310_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB310_1; -; SM60-NEXT: $L__BB310_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_acquire_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; 
SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB311_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB311_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB311_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB311_1; -; SM60-NEXT: $L__BB311_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, 
%r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB312_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB312_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB312_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB312_1; -; SM60-NEXT: $L__BB312_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB313_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB313_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB313_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB313_1; -; SM60-NEXT: $L__BB313_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB314_1: // 
%partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB314_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB314_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB314_1; -; SM60-NEXT: $L__BB314_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB315_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB315_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB315_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB315_1; -; SM60-NEXT: $L__BB315_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB316_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB316_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: 
Header=BB316_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB316_1; -; SM60-NEXT: $L__BB316_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB317_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB317_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB317_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra 
$L__BB317_1; -; SM60-NEXT: $L__BB317_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB318_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB318_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB318_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB318_1; -; SM60-NEXT: $L__BB318_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB319_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB319_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB319_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB319_1; -; SM60-NEXT: $L__BB319_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 
@acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB320_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB320_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB320_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB320_1; -; SM60-NEXT: $L__BB320_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg 
.b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB321_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB321_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB321_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB321_1; -; SM60-NEXT: $L__BB321_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB322_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB322_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB322_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB322_1; -; SM60-NEXT: $L__BB322_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; 
SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB323_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB323_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB323_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB323_1; -; SM60-NEXT: $L__BB323_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: 
and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB324_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB324_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB324_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB324_1; -; SM60-NEXT: $L__BB324_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB325_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB325_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB325_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB325_1; -; SM60-NEXT: $L__BB325_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; 
SM60-NEXT: $L__BB326_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB326_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB326_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB326_1; -; SM60-NEXT: $L__BB326_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB327_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, 
%r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB327_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB327_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB327_1; -; SM60-NEXT: $L__BB327_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB328_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB328_3; -; SM60-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB328_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB328_1; -; SM60-NEXT: $L__BB328_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB329_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB329_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB329_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; 
-; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB329_1; -; SM60-NEXT: $L__BB329_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB330_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB330_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB330_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB330_1; -; SM60-NEXT: $L__BB330_3: // %partword.cmpxchg.end -; SM60-NEXT: 
membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB331_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB331_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB331_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB331_1; -; SM60-NEXT: $L__BB331_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 
%new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB332_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB332_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB332_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB332_1; -; SM60-NEXT: $L__BB332_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: 
seq_cst_monotonic_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB333_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB333_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB333_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB333_1; -; SM60-NEXT: $L__BB333_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; 
-; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB334_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB334_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB334_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB334_1; -; SM60-NEXT: $L__BB334_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB335_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB335_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB335_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB335_1; -; SM60-NEXT: $L__BB335_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB336_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB336_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB336_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB336_1; -; SM60-NEXT: $L__BB336_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 
3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB337_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB337_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB337_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB337_1; -; SM60-NEXT: $L__BB337_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB338_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB338_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB338_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB338_1; -; SM60-NEXT: $L__BB338_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB339_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This 
Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB339_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB339_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB339_1; -; SM60-NEXT: $L__BB339_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB340_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB340_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB340_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB340_1; -; SM60-NEXT: $L__BB340_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB341_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB341_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB341_1 Depth=1 
-; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB341_1; -; SM60-NEXT: $L__BB341_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB342_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB342_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB342_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB342_1; -; 
SM60-NEXT: $L__BB342_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB343_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB343_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB343_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB343_1; -; SM60-NEXT: $L__BB343_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB344_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB344_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB344_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB344_1; -; SM60-NEXT: $L__BB344_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, 
i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB345_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB345_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB345_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB345_1; -; SM60-NEXT: $L__BB345_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; 
SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB346_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB346_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB346_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB346_1; -; SM60-NEXT: $L__BB346_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_acquire_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_acquire_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB347_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB347_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB347_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB347_1; -; SM60-NEXT: $L__BB347_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, 
[seq_cst_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB348_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB348_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB348_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB348_1; -; SM60-NEXT: $L__BB348_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: 
mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB349_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB349_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB349_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB349_1; -; SM60-NEXT: $L__BB349_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB350_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB350_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB350_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB350_1; -; SM60-NEXT: $L__BB350_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB351_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop 
Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB351_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB351_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB351_1; -; SM60-NEXT: $L__BB351_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB352_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, 
%r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB352_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB352_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB352_1; -; SM60-NEXT: $L__BB352_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB353_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB353_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB353_1 Depth=1 -; SM60-NEXT: 
and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB353_1; -; SM60-NEXT: $L__BB353_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB354_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB354_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB354_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB354_1; -; SM60-NEXT: 
$L__BB354_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_global_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB355_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB355_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB355_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB355_1; -; SM60-NEXT: $L__BB355_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB356_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB356_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB356_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB356_1; -; SM60-NEXT: $L__BB356_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, 
i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB357_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB357_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB357_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB357_1; -; SM60-NEXT: $L__BB357_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared_cta( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg 
.b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB358_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB358_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB358_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB358_1; -; SM60-NEXT: $L__BB358_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: seq_cst_seq_cst_i16_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, 
[seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB359_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB359_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB359_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB359_1; -; SM60-NEXT: $L__BB359_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.gl; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 
%r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, 
[monotonic_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; 
SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic - ret i32 %new 
-} - -define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_monotonic_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: 
atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; 
-; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; 
SM60-LABEL: monotonic_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 
%new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_acquire_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; -; 
SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; 
-; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 
%new) { -; SM60-LABEL: monotonic_seq_cst_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 
[func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: monotonic_seq_cst_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM60-NEXT: 
ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: 
acquire_monotonic_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 
%cmp, i32 %new acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_monotonic_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; -; 
SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: 
ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; 
SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; 
-; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_gpu(ptr 
addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_acquire_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 
%cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; -; SM60-NEXT: 
atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) 
{ -; SM60-LABEL: acquire_seq_cst_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acquire_seq_cst_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = 
cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], 
%r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, 
[release_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg 
.b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_monotonic_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define 
i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release 
acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[release_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_acquire_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; 
-; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 
@release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: 
atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i32_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: release_seq_cst_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: 
acq_rel_monotonic_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret 
i32 %new -} - -define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: 
st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, 
[acq_rel_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_monotonic_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; 
SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_generic_gpu( -; 
SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 
@acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_acquire_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; -; 
SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 
%r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 
%new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: acq_rel_seq_cst_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 
%rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 
%new) { -; SM60-LABEL: seq_cst_monotonic_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 
[func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; 
SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_monotonic_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: 
seq_cst_acquire_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst 
acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, 
[%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[seq_cst_acquire_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, 
i32 %new) { -; SM60-LABEL: seq_cst_acquire_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 
%cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; -; SM60-NEXT: 
atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[seq_cst_seq_cst_i32_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) 
{ -; SM60-LABEL: seq_cst_seq_cst_i32_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM60-LABEL: seq_cst_seq_cst_i32_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b32 %r<4>; -; SM60-NEXT: .reg .b64 %rd<2>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM60-NEXT: st.param.b32 [func_retval0], %r2; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, 
i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; 
-; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; -; 
SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_monotonic_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: 
-; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: 
monotonic_acquire_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 
@monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_acquire_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: 
ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) 
%addr, i64 %cmp, i64 %new) { -; SM60-LABEL: monotonic_seq_cst_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 
@acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic - 
ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 
%rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_monotonic_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; -; 
SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM60-NEXT: 
atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 
%rd3, [acquire_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_acquire_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i64_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 
%rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 
@acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acquire_seq_cst_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM60-NEXT: 
atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; 
-; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: 
ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; 
SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_monotonic_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared_sys( -; SM60: { -; 
SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_acquire_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 
%new) { -; SM60-LABEL: release_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 
@release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM60-NEXT: 
atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: release_seq_cst_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: 
ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; 
SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared( -; 
SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_gpu(ptr 
addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_monotonic_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_cta(ptr 
%addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_sys(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") 
acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) 
%addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_acquire_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; 
-; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_generic_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_global_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared_cta( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: acq_rel_seq_cst_i64_shared_gpu( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new -} - -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic( -; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; 
SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic_sys( +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic_cta( +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: 
.reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_generic_gpu( +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acquire_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 
%r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global( +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global_sys( +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: 
release_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global_cta( +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: release_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: 
ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_global_gpu( +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: 
seq_cst_monotonic_i64_shared( +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared_sys( +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared_cta( +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret 
i32 %new } -define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_monotonic_i64_shared_gpu( +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic( +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: seq_cst_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: 
ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic_sys( +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic ret i64 %new } -define i64 
@seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic_cta( +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic_gpu( +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: monotonic_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global_sys( +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; 
SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global_cta( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acquire_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, 
i64 %cmp, i64 %new syncscope("block") acquire seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global_gpu( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, 
[release_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared_sys( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: release_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared_cta( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) 
{ +; SM60-LABEL: acq_rel_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared_gpu( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: 
st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_monotonic_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM60-NEXT: 
ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; -; SM60-NEXT: atom.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic_cta( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_acquire_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; -; SM60-NEXT: atom.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; 
SM60-LABEL: seq_cst_seq_cst_i64_generic_gpu( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM60-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM60: { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; -; SM60-NEXT: atom.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; 
SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB60_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB60_1; +; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.sys; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global_sys( +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 
%rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; -; SM60-NEXT: atom.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global_cta( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_sys( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; -; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global_gpu( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; -; SM60-NEXT: atom.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; 
SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB64_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB64_1; +; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; -; SM60-NEXT: atom.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB65_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; 
SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB65_1; +; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared_cta( +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; -; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; -; SM60-NEXT: atom.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: 
seq_cst_seq_cst_i64_shared_gpu( +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; -; SM60-NEXT: membar.gl; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; -; SM60-NEXT: atom.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 07b2f2f8fa9b0..5cb344d5ded84 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred 
%p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 
@monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -70,13 +70,13 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 
bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; 
-; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -166,7 +169,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,14 +179,15 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -191,10 +195,10 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; 
SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -221,14 +225,15 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -236,10 +241,11 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -256,7 +262,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; 
SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -266,14 +272,15 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -281,10 +288,11 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -295,13 +303,13 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: and.b32 
%r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -313,12 +321,12 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -326,10 +334,11 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, 
[release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -340,13 +349,13 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -356,14 +365,15 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -371,10 +381,11 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) 
%addr, i8 %cmp, i8 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -385,13 +396,13 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -401,14 +412,15 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release 
seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -416,10 +428,11 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -430,13 +443,13 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -446,15 +459,15 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 
%new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -462,10 +475,11 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -476,13 +490,13 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; 
SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -495,12 +509,12 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic_gpu( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -508,10 +522,11 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -522,13 +537,13 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 
%r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -538,15 +553,15 @@ define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global_sys( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -554,10 +569,11 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; 
SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -574,7 +590,7 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -584,15 +600,15 @@ define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global_cta( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -600,10 +616,11 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i8_global_cta_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -633,12 +650,12 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global_gpu( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -646,10 +663,11 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 
%r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -666,7 +684,7 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -676,16272 +694,1186 @@ define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_sys( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, 
[monotonic_acquire_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_cta( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra 
$L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared_gpu( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_sys( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; 
SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB18_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // 
%partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_cta( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB19_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acquire_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB20_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } 
-define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_sys( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; 
SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB21_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB21_1; ; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_cta( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 
255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB22_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global_gpu( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: release_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: 
.reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB23_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 
%r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_sys( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_sys_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; 
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB24_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB24_1; ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_cta( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB25_3; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: @%p1 bra $L__BB25_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_gpu_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 
-; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB26_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_sys( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, 
[seq_cst_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_sys_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB27_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new } -define i8 
@acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_cta( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB28_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new } -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic_gpu( +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: seq_cst_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_gpu_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB29_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new } -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_sys( +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-NEXT: .reg .b32 
%r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB30_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB30_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB30_1; -; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new + %pairold = cmpxchg ptr 
addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new } -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_cta( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB31_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB31_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB31_1; -; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global_gpu( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: monotonic_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; 
-; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB32_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB32_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB32_1; -; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: 
setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB33_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB33_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB33_1; -; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB34_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // 
in Loop: Header=BB34_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB34_1; -; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB35_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: 
mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB35_1; -; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB36_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB36_1; -; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB37_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB37_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB37_1; -; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 
@acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB38_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB38_1; -; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; 
SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB39_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB39_1; -; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB40_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB40_1; -; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; -; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB41_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB41_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB41_1; -; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, 
%rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB42_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB42_1; -; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, 
%r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB43_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB43_1; -; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 
%r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB44_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB44_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB44_1; -; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, 
%r16, %r2; -; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB45_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB45_1; -; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; 
-; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB46_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB46_1; -; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; 
SM70-NEXT: @%p1 bra $L__BB47_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB47_1; -; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB48_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB48_1 
Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB48_1; -; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB49_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, 
%r8; -; SM70-NEXT: @%p2 bra $L__BB49_1; -; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB50_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB50_1; -; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB51_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB51_1; -; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB52_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB52_1; -; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 
@acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB53_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB53_1; -; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: 
release_monotonic_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB54_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB54_1; -; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 
%rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB55_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB55_1; -; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB56_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB56_1; -; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_sys_param_1]; -; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB57_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB57_1; -; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 
%r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB58_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB58_1; -; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; 
SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB59_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB59_1; -; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB60_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB60_1; -; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB61_1: // 
%partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB61_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB61_1; -; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_monotonic_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: 
or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB62_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB62_1; -; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra 
$L__BB63_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB63_1; -; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB64_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, 
%r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB64_1; -; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB65_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB65_1; -; SM70-NEXT: $L__BB65_3: 
// %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB66_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB66_1; -; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB67_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB67_1; -; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire 
- ret i8 %new -} - -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB68_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB68_1; -; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { 
-; SM70-LABEL: release_acquire_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB69_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB69_1; -; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 
%rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB70_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB70_1; -; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_acquire_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB71_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB71_1; -; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB72_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB72_1; -; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cta_param_1]; -; 
SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB73_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB73_1; -; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; 
-; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB74_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB74_1; -; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, 
%rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB75_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB75_1; -; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB76_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB76_1; -; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB77_1: 
// %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB77_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB77_1; -; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, 
%r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB78_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB78_1; -; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB79_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB79_1; -; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: release_seq_cst_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB80_3; -; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB80_1; -; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB81_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; 
SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB81_1; -; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB82_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB82_1; -; SM70-NEXT: $L__BB82_3: // 
%partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB83_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB83_1; -; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB84_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB84_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB84_1; -; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new 
-} - -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB85_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB85_1; -; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM70-LABEL: acq_rel_monotonic_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB86_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB86_1; -; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; 
SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB87_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB87_1; -; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; 
SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB88_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB88_1; -; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, 
[acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB89_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB89_1; -; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB90_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB90_1; -; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; 
-; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB91_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB91_1; -; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, 
%r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB92_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB92_1; -; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB93_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB93_1; -; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB94_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB94_1; -; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, 
%r16, %r2; -; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB95_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB95_1; -; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB96_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB96_1; -; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB97_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB97_1; -; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_acquire_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; 
SM70-NEXT: @%p1 bra $L__BB98_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB98_1; -; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB99_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB99_1 Depth=1 
-; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB99_1; -; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB100_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB100_1; 
-; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB101_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB101_1; -; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB102_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB102_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB102_1; -; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 
%new -} - -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB103_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB103_1; -; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM70-LABEL: acq_rel_seq_cst_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB104_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB104_1; -; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg 
.b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB105_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB105_1; -; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB106_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB106_1; -; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 
%rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB107_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB107_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB107_1; -; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, 
[seq_cst_monotonic_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB108_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB108_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB108_1; -; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: 
and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB109_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB109_1; -; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; 
-; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB110_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB110_1; -; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; 
SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB111_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB111_1; -; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 
%r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB112_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB112_1; -; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB113_1: // 
%partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB113_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB113_1; -; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 
%r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB114_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB114_1; -; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 
%r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB115_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB115_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB115_1; -; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra 
$L__BB116_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB116_1; -; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB117_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1 -; 
SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB117_1; -; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB118_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB118_1; -; 
SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB119_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB119_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB119_1; -; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB120_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB120_1; -; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 
%new -} - -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB121_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB121_1; -; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; 
SM70-LABEL: seq_cst_acquire_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB122_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB122_1; -; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg 
.b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB123_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB123_1; -; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB124_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB124_1; -; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_acquire_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 
%rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB125_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB125_1; -; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, 
[seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB126_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB126_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB126_1; -; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, 
%r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB127_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB127_1; -; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 
%r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB128_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB128_1; -; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB129_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB129_1; -; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, 
%r2; -; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB130_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB130_1; -; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB131_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB131_1; -; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB132_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB132_1; -; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: 
@%p1 bra $L__BB133_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB133_1; -; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; -; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM70-NEXT: @%p1 bra $L__BB134_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: 
Header=BB134_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; -; SM70-NEXT: @%p2 bra $L__BB134_1; -; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB135_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra 
$L__BB135_1; -; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB136_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB136_1; -; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") 
monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB137_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB137_1; -; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global_sys( -; SM70: { -; 
SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB138_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB138_1; -; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB139_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB139_1; -; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB140_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB140_1; -; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, 
%r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB141_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB141_1; -; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB142_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB142_1; -; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_monotonic_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, 
%r15, %r2; -; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB143_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB143_1; -; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; 
-; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB144_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB144_1; -; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB145_3; -; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB145_1; -; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB146_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, 
%r8; -; SM70-NEXT: @%p2 bra $L__BB146_1; -; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB147_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB147_1; -; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 
[func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB148_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB148_1; -; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 
%new -} - -define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB149_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB149_1; -; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
monotonic_acquire_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB150_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB150_1; -; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; 
SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB151_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB151_1; -; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_acquire_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; 
SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB152_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB152_1; -; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, 
[monotonic_seq_cst_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB153_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB153_1; -; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; 
-; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB154_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB154_1; -; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; 
SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB155_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB155_1; -; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, 
[%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB156_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB156_1; -; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB157_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB157_1; -; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: 
atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB158_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB158_1; -; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 
bra $L__BB159_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB159_1; -; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB160_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1 -; 
SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB160_1; -; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB161_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; 
SM70-NEXT: @%p2 bra $L__BB161_1; -; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB162_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB162_1; -; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; 
-; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB163_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB163_1; -; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, 
i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB164_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB164_1; -; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 
%r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB165_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB165_1; -; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[acquire_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB166_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB166_1; -; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: and.b64 
%rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB167_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB167_1; -; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: 
and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB168_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB168_1; -; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: 
not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB169_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB169_1; -; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: 
ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB170_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB170_1; -; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 
-; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB171_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB171_1; -; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: 
@%p1 bra $L__BB172_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB172_1; -; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB173_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, 
%r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB173_1; -; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB174_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB174_1; -; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; 
SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB175_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB175_1; -; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire 
- ret i16 %new -} - -define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB176_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB176_1; -; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared_sys( 
-; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB177_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB177_1; -; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB178_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB178_1; -; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_gpu_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB179_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB179_1; -; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 
%rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB180_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB180_1; -; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB181_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB181_1; -; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 
%r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB182_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB182_1; -; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop -; 
SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB183_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB183_1; -; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; 
SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB184_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB184_1; -; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 
bra $L__BB185_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB185_1; -; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB186_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1 -; 
SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB186_1; -; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB187_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra 
$L__BB187_1; -; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB188_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB188_1; -; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 
[func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB189_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB189_1; -; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 
@release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB190_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB190_1; -; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg 
.b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB191_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB191_1; -; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[release_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB192_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB192_1; -; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: 
fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB193_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB193_1; -; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB194_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB194_1; -; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB195_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB195_1; -; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB196_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB196_1; -; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB197_1: // 
%partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB197_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB197_1; -; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: 
atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB198_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB198_1; -; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB199_3; -; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB199_1; -; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB200_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; 
SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB200_1; -; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB201_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB201_1; -; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB202_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB202_1; -; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, 
i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB203_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB203_1; -; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_sys(ptr 
addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB204_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB204_1; -; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; 
-; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB205_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB205_1; -; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: 
// %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB206_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB206_1; -; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB207_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB207_1; -; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 
%rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB208_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB208_1; -; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB209_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB209_1; -; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB210_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB210_1; -; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: 
$L__BB211_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB211_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB211_1; -; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; 
SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB212_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB212_1; -; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB213_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB213_1; -; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB214_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in 
Loop: Header=BB214_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB214_1; -; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB215_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 
%r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB215_1; -; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB216_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB216_1; -; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; 
SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB217_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB217_1; -; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - 
ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB218_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB218_1; -; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: 
acq_rel_monotonic_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB219_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB219_1; -; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; 
SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB220_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB220_1; -; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB221_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB221_1; -; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB222_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB222_1; -; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, 
[acq_rel_monotonic_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB223_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB223_1; -; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, 
%rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB224_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB224_1; -; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB225_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB225_1; -; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB226_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB226_1; -; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner 
Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB227_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB227_1; -; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB228_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB228_1; -; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB229_3; -; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB229_1; -; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB230_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: 
setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB230_1; -; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB231_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB231_1; -; SM70-NEXT: $L__BB231_3: 
// %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB232_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB232_1; -; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB233_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB233_1; -; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define 
i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB234_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB234_1; -; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; 
SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB235_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB235_1; -; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB236_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB236_1; -; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; 
SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB237_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB237_1; -; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 
%r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB238_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB238_1; -; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB239_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB239_1; -; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 
%r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB240_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB240_1; -; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; 
SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB241_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB241_1; -; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, 
%r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB242_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB242_1; -; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB243_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB243_1; -; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB244_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB244_1 
Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB244_1; -; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB245_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra 
$L__BB245_1; -; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB246_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB246_1; -; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 
[func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB247_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB247_1; -; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") 
seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB248_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB248_1; -; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, 
i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB249_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB249_1; -; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg 
.b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB250_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB250_1; -; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB251_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB251_1; -; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB252_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB252_1; -; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 
%rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB253_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB253_1; -; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; 
SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB254_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB254_1; -; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB255_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB255_1; -; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: 
$L__BB256_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB256_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB256_1; -; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; 
SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB257_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB257_1; -; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB258_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB258_1; -; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB259_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in 
Loop: Header=BB259_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB259_1; -; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_acquire_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB260_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 
%r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB260_1; -; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB261_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB261_1; -; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB262_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB262_1; -; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 
@seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB263_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB263_1; -; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global_sys( -; SM70: { -; SM70-NEXT: .reg .pred 
%p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB264_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB264_1; -; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB265_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB265_1; -; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_global_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i16_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB266_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB266_1; -; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared_sys( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, 
[seq_cst_seq_cst_i16_shared_sys_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB267_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB267_1; -; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared_cta( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: 
and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB268_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB268_1; -; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: seq_cst_seq_cst_i16_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, 
%r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB269_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB269_1; -; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.gpu; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg 
.b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} 
- -define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; 
-; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_monotonic_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; 
SM70-LABEL: monotonic_acquire_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; 
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_acquire_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, 
[monotonic_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_generic_gpu( -; SM70: { -; 
SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; 
-; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: monotonic_seq_cst_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; 
SM70-LABEL: acquire_monotonic_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) 
%addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[acquire_monotonic_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_monotonic_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: 
acquire_acquire_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, 
i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_acquire_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; 
-; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acquire_seq_cst_i32_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 
@acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; -; 
SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acquire_seq_cst_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; -; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[release_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; -; SM70-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; -; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: 
release_monotonic_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; -; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; -; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; -; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; -; SM70-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_monotonic_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: 
ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; 
SM70-LABEL: release_acquire_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_acquire_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, 
[release_seq_cst_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; 
SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new 
syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: release_seq_cst_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; -; 
SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acq_rel_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, 
i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_monotonic_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: 
ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; -; SM70-NEXT: 
atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acq_rel_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_acquire_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: 
acq_rel_seq_cst_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = 
cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, 
[acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: acq_rel_seq_cst_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: 
acq_rel_seq_cst_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; 
-; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; 
SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_cta(ptr 
addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_monotonic_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; -; SM70-NEXT: 
atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; 
-; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) 
%addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_acquire_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, 
[%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; 
-; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_global_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_global_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: 
seq_cst_seq_cst_i32_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 
[func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM70-LABEL: seq_cst_seq_cst_i32_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b32 %r<4>; -; SM70-NEXT: .reg .b64 %rd<2>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM70-NEXT: st.param.b32 [func_retval0], %r2; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; -; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: 
ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; -; SM70-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_sys_param_2]; -; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; -; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; -; SM70-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; -; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, 
i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_monotonic_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 
%new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 
[func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, 
[monotonic_acquire_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_acquire_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i64_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; 
SM70-LABEL: monotonic_seq_cst_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: monotonic_seq_cst_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: 
acquire_monotonic_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") 
acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_monotonic_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - 
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; 
SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, 
[acquire_acquire_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_acquire_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; -; 
SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global_cta( -; SM70: { -; 
SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new 
syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acquire_seq_cst_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; -; SM70-NEXT: atom.release.sys.cas.b64 
%rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; -; SM70-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, 
[release_monotonic_i64_global_sys_param_2]; -; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; -; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; -; SM70-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[release_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; -; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; -; SM70-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_monotonic_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: 
release_acquire_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 
%cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") 
release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_acquire_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], 
%rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, 
[release_seq_cst_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i64_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) 
%addr, i64 %cmp, i64 %new) { -; SM70-LABEL: release_seq_cst_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new 
syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: 
st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: 
ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_monotonic_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; 
SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_global_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_shared_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: 
acq_rel_acquire_i64_shared_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_acquire_i64_shared_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - 
-define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_generic_gpu( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global_sys( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; 
-; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global_cta( -; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new } -define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_global_gpu( +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; -; 
SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new } -define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys( +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new + %pairold = cmpxchg ptr 
addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new } -define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared_cta( +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acquire_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new } -define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: acq_rel_seq_cst_i64_shared_gpu( +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; 
SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic_sys( +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic_cta( +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: release_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_generic_gpu( +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global_sys( +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; 
SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global_cta( +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_global_gpu( +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: 
seq_cst_monotonic_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared_sys( +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_acquire_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: 
ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared_cta( +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: seq_cst_seq_cst_i32_global_cta( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new } -define i64 
@seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_monotonic_i64_shared_gpu( +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_generic_sys( +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_generic_cta( +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: monotonic_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_generic_gpu( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; 
SM70-LABEL: acquire_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global_sys( +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; 
SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global_cta( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acquire_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_global_gpu( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM70-NEXT: 
ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared_sys( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire ret i64 %new } -define i64 
@seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared_cta( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: release_seq_cst_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_acquire_i64_shared_gpu( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; 
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic_cta( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM70: 
{ ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_generic_gpu( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_monotonic_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_global_sys( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM70-LABEL: seq_cst_acquire_i64_global_cta( ; SM70: { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } @@ -16962,71 +1894,229 @@ define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 ret i64 %new } -define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_global_gpu( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: 
.reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB60_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB60_1; +; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") 
seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys( +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; -; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared_cta( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_sys( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; -; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: 
ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; -; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM70-LABEL: seq_cst_seq_cst_i64_shared_gpu( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_global_gpu( ; SM70: { -; SM70-NEXT: .reg .b64 %rd<5>; +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; -; SM70-NEXT: fence.sc.gpu; -; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; -; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM70-NEXT: st.param.b64 [func_retval0], %rd3; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; - %pairold 
= cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new +} + +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_generic_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB64_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB64_1; +; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg 
ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_shared_cta( +; SM70: { +; SM70-NEXT: .reg .pred %p<3>; +; SM70-NEXT: .reg .b16 %rs<2>; +; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b64 %rd<3>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: cvt.u32.u64 %r10, %rd2; +; SM70-NEXT: and.b32 %r11, %r10, 3; +; SM70-NEXT: shl.b32 %r1, %r11, 3; +; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: shl.b32 %r13, %r12, %r1; +; SM70-NEXT: not.b32 %r2, %r13; +; SM70-NEXT: cvt.u32.u16 %r14, %rs1; +; SM70-NEXT: and.b32 %r15, %r14, 255; +; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r4, %r9, %r1; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM70-NEXT: // =>This Inner Loop Header: Depth=1 +; SM70-NEXT: or.b32 %r17, %r20, %r3; +; SM70-NEXT: or.b32 %r18, %r20, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: @%p1 bra $L__BB65_3; +; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM70-NEXT: and.b32 %r8, %r7, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: @%p2 bra $L__BB65_1; +; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new +} + +define i32 
@acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_generic_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new +} + +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM70-LABEL: acq_rel_acquire_i32_shared_cta( +; SM70: { +; SM70-NEXT: .reg .b32 %r<4>; +; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: st.param.b32 [func_retval0], %r2; +; SM70-NEXT: ret; + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index b05a404b6be7e..7cb259023d6dd 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} -define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_sys( +define i8 
@monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_cta( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -70,13 +70,13 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_cluster( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: monotonic_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; 
SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_generic_gpu( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -160,13 +163,13 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,14 +179,15 @@ define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_sys( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -191,10 +195,10 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 
%rd2, [monotonic_monotonic_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -211,7 +215,7 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -221,14 +225,15 @@ define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_cta( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acquire_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -236,10 +241,11 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: 
; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -266,14 +272,15 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_cluster( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -281,10 +288,11 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -301,7 +309,7 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -313,12 +321,12 @@ define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_global_gpu( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -326,10 +334,11 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0]; +; SM90-NEXT: 
ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -346,7 +355,7 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -356,14 +365,15 @@ define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_sys( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: release_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -371,10 +381,11 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -385,13 +396,13 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -401,14 +412,15 @@ define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, 
i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_cta( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -416,10 +428,11 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -430,13 +443,13 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -446,14 +459,15 @@ define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: 
@%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_cluster( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -461,10 +475,11 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -475,13 +490,13 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: 
or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -491,14 +506,15 @@ define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_monotonic_i8_shared_gpu( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -506,10 +522,11 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, 
%r11, 3; @@ -520,13 +537,13 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -536,14 +553,15 @@ define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_sys( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -551,10 +569,11 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_sys_param_0]; +; SM90-NEXT: ld.param.b8 
%rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -565,13 +584,13 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -581,15 +600,15 @@ define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_cta( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: 
seq_cst_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -597,10 +616,11 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -611,13 +631,13 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -630,12 +650,12 @@ define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("block") seq_cst acquire ret i8 %new } -define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_cluster( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -643,10 +663,11 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -657,13 +678,13 @@ define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -673,22036 +694,1446 @@ define i8 
@monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_generic_gpu( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, 
%r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_sys( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; 
SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire - ret i8 %new + %pairold = cmpxchg 
ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_cta( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: 
or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB17_1; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_cluster( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, 
%r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_global_gpu( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 
%cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; 
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_sys( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_sys_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_sys_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_cta( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: 
ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_cluster( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_cluster_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; 
-; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_acquire_i8_shared_gpu( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_gpu_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_sys(ptr 
%addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_sys( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_sys_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_cta( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 
%r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB25_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: @%p1 bra $L__BB25_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg 
.b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_cluster_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: 
mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_gpu_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_sys( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_sys_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB28_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_cta( +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, 
%r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB29_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new } -define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_cluster( +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: 
shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB30_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB30_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB30_1; -; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new } -define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_global_gpu( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .pred %p<3>; 
-; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB31_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB31_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB31_1; -; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_sys( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB32_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB32_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, 
%r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB32_1; -; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst - ret i8 %new -} - -define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB33_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB33_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB33_1; -; SM90-NEXT: $L__BB33_3: 
// %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst - ret i8 %new -} - -define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB34_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB34_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB34_1; -; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst - ret i8 %new -} - -define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB35_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB35_1; -; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB36_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB36_1; -; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) 
{ -; SM90-LABEL: acquire_monotonic_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB37_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB37_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB37_1; -; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 
%r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB38_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB38_1; -; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[acquire_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB39_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB39_1; -; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, 
%rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB40_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB40_1; -; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; 
SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB41_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB41_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB41_1; -; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; 
SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB42_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB42_1; -; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB43_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB43_1; -; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB44_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB44_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB44_1; -; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM90-NEXT: 
// =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB45_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB45_1; -; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, 
%r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB46_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB46_1; -; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic - ret i8 %new -} - -define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; 
-; SM90-NEXT: @%p1 bra $L__BB47_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB47_1; -; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB48_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 -; SM90-NEXT: and.b32 
%r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB48_1; -; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB49_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB49_1; -; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB50_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB50_1; -; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new 
syncscope("cluster") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB51_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB51_1; -; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
acquire_acquire_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB52_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB52_1; -; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; 
SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB53_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB53_1; -; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[acquire_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB54_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB54_1; -; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_gpu_param_0]; -; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB55_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB55_1; -; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, 
%rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB56_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB56_1; -; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, 
%r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB57_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB57_1; -; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; 
SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB58_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB58_1; -; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire - ret i8 %new -} - -define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: 
and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB59_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB59_1; -; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB60_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB60_1; -; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB61_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB61_1; -; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB62_3; -; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB62_1; -; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB63_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, 
%r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB63_1; -; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB64_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB64_1; -; SM90-NEXT: $L__BB64_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB65_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB65_1; -; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB66_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB66_1; -; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new 
syncscope("cluster") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB67_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB67_1; -; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_sys(ptr 
addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB68_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB68_1; -; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg 
.pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB69_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB69_1; -; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 
%rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB70_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB70_1; -; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst - ret i8 %new -} - -define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[acquire_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB71_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB71_1; -; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB72_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB72_1; -; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB73_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB73_1; -; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; 
SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB74_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB74_1; -; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: 
and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB75_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB75_1; -; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; 
SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB76_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB76_1; -; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB77_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB77_1; -; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB78_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB78_1; -; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra 
$L__BB79_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB79_1; -; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB80_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1 -; SM90-NEXT: 
and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB80_1; -; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB81_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB81_1; 
-; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB82_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB82_1; -; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; 
-; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic - ret i8 %new -} - -define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB83_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB83_1; -; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic - ret 
i8 %new -} - -define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB84_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB84_1; -; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_cta( -; SM90: { -; 
SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB85_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB85_1; -; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; 
SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB86_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB86_1; -; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_gpu_param_2]; -; 
SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB87_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB87_1; -; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 
%r9, [release_acquire_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB88_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB88_1; -; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 
%r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB89_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB89_1; -; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 
3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB90_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB90_1; -; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 
%r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB91_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB91_1; -; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: 
shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB92_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB92_1; -; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; 
SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB93_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB93_1; -; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB94_1: // 
%partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB94_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB94_1; -; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire - ret i8 %new -} - -define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 
%r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB95_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB95_1; -; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; 
SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB96_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB96_1; -; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB97_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: 
Header=BB97_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB97_1; -; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB98_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: 
mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB98_1; -; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB99_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB99_1; -; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB100_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB100_1; -; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, 
i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB101_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB101_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB101_1; -; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_cluster(ptr 
addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB102_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB102_1; -; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: 
release_seq_cst_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB103_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB103_1; -; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; 
-; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB104_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB104_1; -; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB105_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB105_1; -; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: 
ld.param.b64 %rd2, [release_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB106_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB106_1; -; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst - ret i8 %new -} - -define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: 
fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB107_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB107_1; -; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB108_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB108_1; -; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, 
%r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB109_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB109_1; -; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: 
not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB110_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB110_1; -; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: 
shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB111_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB111_1; -; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: 
and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB112_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB112_1; -; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop -; SM90-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB113_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB113_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB113_1; -; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, 
%r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB114_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB114_1; -; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB115_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB115_1; -; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, 
%r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB116_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB116_1; -; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB117_3; -; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB117_1; -; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB118_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: 
Header=BB118_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB118_1; -; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB119_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB119_1; -; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB120_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB120_1; -; SM90-NEXT: 
$L__BB120_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB121_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB121_1; -; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - 
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB122_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB122_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB122_1; -; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire - ret i8 %new -} - 
-define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB123_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB123_1; -; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_sys( -; 
SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB124_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB124_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB124_1; -; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; 
SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB125_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB125_1; -; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 
%rs1, [acq_rel_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB126_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB126_1; -; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 
%rd2, [acq_rel_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB127_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB127_1; -; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: 
ld.param.b8 %r9, [acq_rel_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB128_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB128_1; -; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB129_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB129_1; -; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; 
SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB130_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB130_1; -; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_acquire_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 
%r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB131_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB131_1; -; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 
%r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB132_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB132_1; -; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: 
and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB133_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB133_1; -; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop 
Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB134_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB134_1; -; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 
%r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB135_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB135_1; -; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB136_3; -; SM90-NEXT: // 
%bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB136_1; -; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB137_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1 -; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB137_1; -; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB138_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, 
%r8; -; SM90-NEXT: @%p2 bra $L__BB138_1; -; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB139_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB139_1; -; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB140_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB140_1; -; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB141_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB141_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB141_1; -; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst - ret i8 %new -} 
- -define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB142_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB142_1; -; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 
%cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB143_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB143_1; -; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; 
SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB144_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB144_1; -; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB145_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB145_1; -; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB146_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB146_1; -; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, 
[seq_cst_monotonic_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB147_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB147_1; -; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; 
-; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB148_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB148_1; -; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 
255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB149_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB149_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB149_1; -; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, 
%r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB150_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB150_1; -; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; 
SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB151_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB151_1; -; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB152_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB152_1; -; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB153_1: // 
%partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB153_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB153_1; -; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB154_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB154_1; -; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; 
-; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB155_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB155_1; -; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: 
@%p1 bra $L__BB156_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB156_1; -; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB157_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1 -; SM90-NEXT: and.b32 
%r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB157_1; -; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB158_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra 
$L__BB158_1; -; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB159_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB159_1; -; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 
[func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB160_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB160_1; -; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") 
seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB161_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB161_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB161_1; -; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 
%cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB162_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB162_1; -; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global_gpu( -; SM90: { -; 
SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB163_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB163_1; -; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: 
.reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB164_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB164_1; -; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, 
[seq_cst_acquire_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB165_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB165_1; -; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB166_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB166_1; -; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: 
ld.param.b8 %r9, [seq_cst_acquire_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB167_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB167_1; -; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB168_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB168_1; -; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB169_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB169_1; -; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 
%r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB170_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB170_1; -; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; 
SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB171_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB171_1; -; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop 
Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB172_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB172_1; -; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: 
atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB173_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB173_1; -; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: 
setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB174_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB174_1; -; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB175_3; -; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB175_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB175_1; -; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB176_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, 
%r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB176_1; -; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB177_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra 
$L__BB177_1; -; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB178_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB178_1; -; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB179_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB179_1; -; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB180_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB180_1; -; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: 
monotonic_monotonic_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB181_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB181_1; -; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB182_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB182_1; -; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0]; -; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB183_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB183_1; -; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; 
SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB184_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB184_1; -; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, 
%rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB185_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB185_1; -; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 
%r19, %r15, %r2; -; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB186_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB186_1; -; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB187_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB187_1; -; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 
bra $L__BB188_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB188_1; -; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB189_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB189_1; -; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB190_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB190_1; -; SM90-NEXT: $L__BB190_3: // 
%partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB191_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB191_1; -; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new 
syncscope("device") monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB192_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB192_1; -; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: 
monotonic_acquire_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB193_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB193_1; -; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; 
-; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB194_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB194_1; -; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB195_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB195_1; -; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_sys_param_1]; -; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB196_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB196_1; -; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB197_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB197_1; -; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB198_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB198_1; -; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; 
SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB199_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB199_1; -; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB200_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB200_1; -; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; 
SM90-NEXT: @%p1 bra $L__BB201_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB201_1; -; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB202_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: 
Header=BB202_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB202_1; -; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB203_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; 
SM90-NEXT: @%p2 bra $L__BB203_1; -; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB204_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB204_1; -; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: 
st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB205_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB205_1; -; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} 
- -define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB206_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB206_1; -; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: 
monotonic_seq_cst_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB207_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB207_1; -; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; 
-; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB208_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB208_1; -; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[monotonic_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB209_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB209_1; -; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[monotonic_seq_cst_i16_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB210_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB210_1; -; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: 
ld.param.b16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB211_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB211_1; -; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB212_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB212_1; -; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 
65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB213_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB213_1; -; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB214_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB214_1; -; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB215_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB215_1; -; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner 
Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB216_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB216_1; -; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 
%p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB217_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB217_1; -; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB218_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1 -; 
SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB218_1; -; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB219_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB219_1; -; SM90-NEXT: $L__BB219_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB220_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB220_1; -; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB221_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB221_1; -; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_cluster(ptr 
addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB222_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB222_1; -; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg 
.pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB223_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB223_1; -; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB224_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB224_1; -; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i16_shared_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB225_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB225_1; -; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, 
[acquire_monotonic_i16_shared_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB226_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB226_1; -; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: 
shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB227_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB227_1; -; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, 
%rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB228_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB228_1; -; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB229_1: // 
%partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB229_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB229_1; -; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: 
atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB230_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB230_1; -; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB231_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM90-NEXT: // in Loop: Header=BB231_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB231_1; -; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB232_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: 
@%p2 bra $L__BB232_1; -; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB233_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB233_1; -; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB234_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB234_1; -; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire - ret i16 %new 
-} - -define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB235_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB235_1; -; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared_sys( -; SM90: { -; 
SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_sys_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_sys_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB236_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB236_1; -; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_cta_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB237_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB237_1; -; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i16_shared_cluster_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_cluster_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB238_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB238_1; -; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_gpu_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, 
[acquire_acquire_i16_shared_gpu_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB239_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB239_1; -; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 
%r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB240_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB240_1; -; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB241_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB241_1; -; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, 
%r2; -; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB242_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB242_1; -; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB243_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB243_1; -; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; 
SM90-NEXT: @%p1 bra $L__BB244_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB244_1; -; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB245_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1 
-; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB245_1; -; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB246_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 
%r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB246_1; -; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB247_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB247_1; -; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB248_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB248_1; -; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) 
%addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB249_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB249_1; -; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_cluster(ptr 
addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB250_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB250_1; -; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_shared_gpu( -; SM90: { 
-; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB251_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB251_1; -; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; 
SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB252_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB252_1; -; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i16_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB253_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB253_1; -; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_cluster_param_1]; -; 
SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB254_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB254_1; -; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 
65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB255_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB255_1; -; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB256_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB256_1; -; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB257_1: // 
%partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB257_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB257_1; -; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB258_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB258_1; -; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, 
%r17; -; SM90-NEXT: @%p1 bra $L__BB259_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB259_1; -; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB260_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1 -; 
SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB260_1; -; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB261_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB261_1; -; 
SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB262_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB262_1; -; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - 
%pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_monotonic_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB263_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB263_1; -; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic - ret i16 %new -} - -define i16 
@release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB264_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB264_1; -; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; 
SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB265_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB265_1; -; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.b16 %rs1, [release_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB266_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB266_1; -; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB267_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB267_1; -; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_sys_param_1]; 
-; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB268_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB268_1; -; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: 
shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB269_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB269_1; -; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; 
SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB270_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB270_1; -; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB271_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB271_1; -; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; 
SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB272_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB272_1; -; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, 
%r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB273_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB273_1; -; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 
%r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB274_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB274_1; -; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_acquire_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB275_3; -; SM90-NEXT: // 
%bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB275_1; -; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB276_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, 
%r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB276_1; -; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB277_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB277_1; -; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; 
SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB278_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB278_1; -; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new 
syncscope("cluster") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB279_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB279_1; -; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: 
release_seq_cst_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB280_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB280_1; -; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 
%r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB281_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB281_1; -; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, 
[release_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB282_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB282_1; -; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i16_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB283_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB283_1; -; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, 
[release_seq_cst_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB284_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB284_1; -; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: 
and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB285_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB285_1; -; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: 
shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB286_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB286_1; -; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: release_seq_cst_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 
%r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB287_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB287_1; -; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, 
%r2; -; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB288_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB288_1; -; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB289_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB289_1; -; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB290_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB290_1; -; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB291_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: 
Header=BB291_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB291_1; -; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB292_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 
%r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB292_1; -; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB293_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB293_1; -; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end -; SM90-NEXT: 
fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB294_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB294_1; -; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB295_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB295_1; -; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret 
i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB296_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB296_1; -; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; 
SM90-LABEL: acq_rel_monotonic_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB297_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB297_1; -; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: 
.reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB298_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB298_1; -; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_monotonic_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; 
SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB299_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB299_1; -; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB300_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB300_1; -; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, 
[acq_rel_acquire_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB301_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB301_1; -; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: 
and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB302_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB302_1; -; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: 
not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB303_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB303_1; -; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB304_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB304_1; -; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop -; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB305_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB305_1; -; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: 
or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB306_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB306_1; -; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: 
setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB307_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB307_1; -; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB308_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; 
SM90-NEXT: // in Loop: Header=BB308_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB308_1; -; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0]; -; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB309_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; 
SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB309_1; -; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.release.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB310_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB310_1; -; SM90-NEXT: $L__BB310_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_acquire_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.release.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB311_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB311_1; -; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: 
ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB312_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB312_1; -; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 
@acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB313_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB313_1; -; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; 
-; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB314_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB314_1; -; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB315_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB315_1; -; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0]; -; 
SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB316_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB316_1; -; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; 
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB317_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB317_1; -; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 
3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB318_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB318_1; -; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; 
SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB319_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB319_1; -; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: 
ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB320_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB320_1; -; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop -; SM90-NEXT: // 
=>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB321_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB321_1; -; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, 
%r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB322_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB322_1; -; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst - ret i16 %new -} - -define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; 
SM90-NEXT: @%p1 bra $L__BB323_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB323_1; -; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB324_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1 -; 
SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB324_1; -; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB325_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB325_1; -; SM90-NEXT: 
$L__BB325_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB326_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB326_1; -; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB327_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB327_1; -; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 
@seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB328_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB328_1; -; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: 
seq_cst_monotonic_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB329_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB329_1; -; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; 
SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB330_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB330_1; -; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB331_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB331_1; -; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_monotonic_i16_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB332_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB332_1; -; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, 
[seq_cst_monotonic_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB333_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB333_1; -; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: 
cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB334_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB334_1; -; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_monotonic_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; 
SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB335_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB335_1; -; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, 
%rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB336_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB336_1; -; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: 
$L__BB337_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB337_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB337_1; -; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: 
or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB338_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB338_1; -; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra 
$L__BB339_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB339_1; -; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB340_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1 -; SM90-NEXT: and.b32 %r8, 
%r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB340_1; -; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB341_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB341_1; -; SM90-NEXT: 
$L__BB341_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB342_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB342_1; -; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 
[func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB343_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB343_1; -; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") 
seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB344_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB344_1; -; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; 
SM90-LABEL: seq_cst_acquire_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB345_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB345_1; -; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; 
SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB346_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB346_1; -; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_acquire_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB347_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB347_1; -; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i16_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB348_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB348_1; -; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1]; -; SM90-NEXT: and.b64 
%rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB349_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB349_1; -; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: 
mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB350_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB350_1; -; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 
%r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB351_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB351_1; -; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: 
$L__BB352_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB352_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB352_1; -; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; 
SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB353_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB353_1; -; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, 
%r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB354_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB354_1; -; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_global_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB355_3; -; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB355_1; -; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared_sys( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB356_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: 
setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB356_1; -; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared_cta( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB357_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB357_1; -; SM90-NEXT: $L__BB357_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB358_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB358_1; -; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.cluster; -; SM90-NEXT: st.param.b32 [func_retval0], 
%r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst - ret i16 %new -} - -define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: seq_cst_seq_cst_i16_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB359_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB359_1; -; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.gpu; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst - 
ret i16 %new -} - -define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_sys_param_2]; -; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cta_param_2]; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2]; -; SM90-NEXT: 
atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_sys_param_2]; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cluster_param_2]; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_gpu_param_2]; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define 
i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_sys_param_2]; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_cta_param_2]; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[monotonic_monotonic_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_monotonic_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_sys(ptr 
addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, 
%r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire - ret i32 %new -} - -define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_acquire_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire - ret i32 %new -} - -define i32 
@monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2]; -; 
SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) 
%addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; 
SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst - ret i32 %new -} - -define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: monotonic_seq_cst_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: 
acquire_monotonic_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 
%cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 
%r2, [acquire_monotonic_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 
%rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic - ret i32 %new -} - -define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_monotonic_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new 
syncscope("device") acquire monotonic - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, 
%r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[acquire_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared_sys( -; SM90: 
{ -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 
%new syncscope("cluster") acquire acquire - ret i32 %new -} - -define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_acquire_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[acquire_seq_cst_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, 
i32 %new syncscope("cluster") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, 
[acquire_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst - ret i32 %new -} - -define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acquire_seq_cst_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; 
SM90-LABEL: release_monotonic_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_sys_param_2]; -; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cta_param_2]; -; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 
%cmp, i32 %new syncscope("cluster") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_sys_param_2]; -; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[release_monotonic_i32_global_cta_param_2]; -; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cluster_param_2]; -; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_gpu_param_2]; -; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; 
SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_sys_param_2]; -; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cta_param_2]; -; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new 
syncscope("cluster") release monotonic - ret i32 %new -} - -define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_monotonic_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic - ret i32 %new -} - -define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.cas.b32 
%r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[release_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_global_gpu( -; SM90: { -; 
SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") 
release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire - ret i32 %new -} - -define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_acquire_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[release_seq_cst_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - 
-define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 
%r2, [release_seq_cst_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst - ret i32 %new -} - -define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: release_seq_cst_i32_shared_gpu( -; SM90: { -; 
SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel 
monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_sys_param_2]; -; SM90-NEXT: 
atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: 
-; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic - ret i32 %new -} - -define i32 
@acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b32 
%r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, 
[acq_rel_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; 
SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire - 
ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2]; -; SM90-NEXT: 
atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[acq_rel_seq_cst_i32_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 
@acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_seq_cst_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; 
SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, 
[seq_cst_monotonic_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; 
SM90-LABEL: seq_cst_monotonic_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2]; -; SM90-NEXT: 
atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; 
SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") 
seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[seq_cst_acquire_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global_gpu( -; SM90: { -; SM90-NEXT: 
.reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; 
SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic_gpu( -; SM90: { 
-; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0]; -; SM90-NEXT: 
fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst - ret i32 %new -} - -define i32 
@seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst - ret i32 %new -} - -define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2]; -; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2]; -; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_monotonic_i64_global_sys_param_2]; -; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; -; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2]; -; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2]; -; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2]; -; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2]; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic - ret i64 %new -} - -define i64 
@monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_acquire_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; 
SM90-LABEL: monotonic_acquire_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, 
i64 %cmp, i64 %new syncscope("device") monotonic acquire - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2]; 
-; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[monotonic_seq_cst_i64_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new 
-} - -define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2]; -; 
SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: 
ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_global_gpu(ptr 
addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic - ret i64 %new -} - -define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_sys_param_2]; -; 
SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[acquire_acquire_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cluster_param_0]; -; 
SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 
%rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire - ret i64 %new -} - -define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire - ret i64 %new -} - -define i64 
@acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[acquire_seq_cst_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst - ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst 
- ret i64 %new -} - -define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_sys_param_2]; -; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cta_param_2]; -; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; 
SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_sys_param_2]; -; SM90-NEXT: 
atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; -; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cluster_param_2]; -; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_gpu_param_0]; 
-; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_gpu_param_2]; -; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_sys_param_2]; -; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cta_param_2]; -; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared_cluster( -; SM90: 
{ -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic - ret i64 %new -} - -define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic - ret i64 %new -} - -define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 
@release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new 
syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; 
SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[release_acquire_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire - ret i64 %new -} - -define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i64_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
release_seq_cst_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 
%cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[release_seq_cst_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst - ret i64 %new -} - -define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
acq_rel_monotonic_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret 
i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; 
SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global_sys( -; SM90: { -; 
SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire - ret i64 %new -} - -define i64 
@acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr 
addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2]; -; 
SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: 
ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global_cluster( -; SM90: { -; 
SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 
%cmp, i64 %new syncscope("") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared_cta( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared_cluster( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst - ret i64 %new -} - -define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared_gpu( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic_sys( +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret 
i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic_cta( +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic_cluster( +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0]; -; 
SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic_gpu( +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, 
%r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global_sys( +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global_cta( +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; 
+; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global_cluster( +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: 
ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global_gpu( +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; 
SM90-LABEL: seq_cst_monotonic_i64_shared_sys( +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared_cta( +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1]; 
-; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared_cluster( +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new 
syncscope("cluster") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared_gpu( +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic_sys( +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_acquire_i64_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic_cta( +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = 
cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic_cluster( +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic_gpu( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2]; 
-; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global_sys( +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
seq_cst_acquire_i64_global_cta( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; ; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global_cluster( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, 
[release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global_gpu( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared_sys( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared_cta( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared_cluster( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared_gpu( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 
%new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic_cta( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic_cluster( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 
%rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic_gpu( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, 
%r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB60_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB60_1; +; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global_sys( +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global_cta( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_sys( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global_cluster( +define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; 
SM90-LABEL: acq_rel_acquire_i32_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global_gpu( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: 
ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2]; -; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 
255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB65_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB65_1; +; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared_cta( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0]; -; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2]; -; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, 
[acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB66_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB66_1; +; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared_cluster( +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 
%r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0]; -; SM90-NEXT: fence.sc.cluster; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2]; -; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared_gpu( +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0]; -; SM90-NEXT: fence.sc.gpu; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2]; -; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: 
ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index 367fc885c0f8c..907a0ec536058 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -26,6 +26,12 @@ """ ) +def get_addrspace_cast(addrspace): + if addrspace == 0: + return "" + else: + return " addrspace({})".format(str(addrspace)) + TESTS = [(60, 50), (70, 63), (90, 87)] LLVM_SCOPES = ["", "block", "cluster", "device"] @@ -42,47 +48,87 @@ ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"} + + if __name__ == "__main__": for sm, ptx in TESTS: with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) - for size, success, failure, addrspace, llvm_scope in product( - SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES, LLVM_SCOPES + + # Our test space is: SIZES X SUCCESS_ORDERINGS X FAILURE_ORDERINGS X ADDRSPACES X LLVM_SCOPES + # This is very large, so we instead test 3 slices. + + # First slice: are all orderings correctly supported, with and without emulation loops? 
+ # set addrspace to global, scope to cta, generate all possible orderings, for all operation sizes + addrspace, llvm_scope = 1, "block" + for size, success, failure in product( + SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS ): - # cluster ordering is supported from SM90 onwards - if sm != 90 and llvm_scope == "cluster": + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=get_addrspace_cast(addrspace), + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], + ), + file=fp, + ) + + # Second slice: Are all scopes correctly supported, with and without emulation loops? + # fix addrspace, ordering, generate all possible scopes, for operation sizes i8, i32 + addrspace, success, failure = 1, "acq_rel", "acquire" + for size in [8, 32]: + print( + cmpxchg_func_no_scope.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=get_addrspace_cast(addrspace), + ), + file=fp, + ) + + for llvm_scope in LLVM_SCOPES: + if sm < 90 and llvm_scope == "cluster": + continue + if llvm_scope == "block": + # skip (acq_rel, acquire, global, cta) continue - if addrspace == 0: - addrspace_cast = "" - else: - addrspace_cast = " addrspace({})".format(str(addrspace)) - # Test default scope print( - cmpxchg_func_no_scope.substitute( + cmpxchg_func.substitute( success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], - addrspace_cast=addrspace_cast, + addrspace_cast=get_addrspace_cast(addrspace), + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], + ), + file=fp, + ) + + # Third slice: Are all address spaces correctly supported? 
+ # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32 + success, failure, llvm_scope = "acq_rel", "acquire", "block" + for size, addrspace in product( + [8, 32], ADDRSPACES + ): + if addrspace == 1: + # skip (acq_rel, acquire, global, cta) + continue + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=get_addrspace_cast(addrspace), llvm_scope=llvm_scope, ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], ), file=fp, ) - - for llvm_scope in LLVM_SCOPES: - # cluster ordering is supported from SM90 onwards - if sm < 90 and llvm_scope == "cluster": - continue - print( - cmpxchg_func.substitute( - success=success, - failure=failure, - size=size, - addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], - addrspace_cast=addrspace_cast, - llvm_scope=llvm_scope, - ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], - ), - file=fp, - ) From d76cb8b43387c997b45b65629b6fc121b3f3e048 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Fri, 11 Jul 2025 21:51:52 +0000 Subject: [PATCH 24/26] black --- llvm/test/CodeGen/NVPTX/cmpxchg.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index 907a0ec536058..75623a59ad481 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -26,12 +26,14 @@ """ ) + def get_addrspace_cast(addrspace): if addrspace == 0: return "" else: return " addrspace({})".format(str(addrspace)) + TESTS = [(60, 50), (70, 63), (90, 87)] LLVM_SCOPES = ["", "block", "cluster", "device"] @@ -49,7 +51,6 @@ def get_addrspace_cast(addrspace): ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"} - if __name__ == "__main__": for sm, ptx in TESTS: with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp: @@ -64,7 +65,7 @@ def get_addrspace_cast(addrspace): for size, success, failure in product( SIZES, 
SUCCESS_ORDERINGS, FAILURE_ORDERINGS ): - print( + print( cmpxchg_func.substitute( success=success, failure=failure, @@ -81,7 +82,7 @@ def get_addrspace_cast(addrspace): # fix addrspace, ordering, generate all possible scopes, for operation sizes i8, i32 addrspace, success, failure = 1, "acq_rel", "acquire" for size in [8, 32]: - print( + print( cmpxchg_func_no_scope.substitute( success=success, failure=failure, @@ -110,13 +111,11 @@ def get_addrspace_cast(addrspace): ), file=fp, ) - - # Third slice: Are all address spaces correctly supported? - # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32 + + # Third slice: Are all address spaces correctly supported? + # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32 success, failure, llvm_scope = "acq_rel", "acquire", "block" - for size, addrspace in product( - [8, 32], ADDRSPACES - ): + for size, addrspace in product([8, 32], ADDRSPACES): if addrspace == 1: # skip (acq_rel, acquire, global, cta) continue From 676b68424be257c6ffd88ad9ec15070c5a23f73d Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Mon, 14 Jul 2025 23:58:57 +0000 Subject: [PATCH 25/26] address review comments --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 1 + llvm/lib/Target/NVPTX/NVPTX.h | 3 ++- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 20 ++++++++++++------- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 2 ++ 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index af134f079ee91..edc8e33559d97 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -307,6 +307,7 @@ void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, auto S = NVPTX::Scope(Imm); switch (S) { case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: return; case NVPTX::Scope::System: O << 
".sys"; diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 15997bc3878d8..7b42537b04466 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -174,7 +174,8 @@ enum Scope : ScopeUnderlyingType { Cluster = 2, Device = 3, System = 4, - LASTSCOPE = System + DefaultDevice = 5, + LASTSCOPE = DefaultDevice }; using AddressSpaceUnderlyingType = unsigned int; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 1f165ff119246..cf55023c888fa 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -520,8 +520,8 @@ NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { auto Ordering = N->getMergedOrdering(); switch (Ordering) { case AtomicOrdering::NotAtomic: - case AtomicOrdering::Unordered: return NVPTX::Ordering::NotAtomic; + case AtomicOrdering::Unordered: case AtomicOrdering::Monotonic: return NVPTX::Ordering::Relaxed; case AtomicOrdering::Acquire: @@ -533,12 +533,14 @@ NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { case AtomicOrdering::SequentiallyConsistent: return NVPTX::Ordering::SequentiallyConsistent; } + llvm_unreachable("Invalid atomic ordering"); } NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { // No "scope" modifier for SM/PTX versions which do not support scoped atomics + // Functionally, these atomics are at device scope if (!Subtarget->hasAtomScope()) - return NVPTX::Scope::Thread; + return NVPTX::Scope::DefaultDevice; return Scopes[N->getSyncScopeID()]; } @@ -778,7 +780,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N, } static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget, - unsigned CodeAddrSpace) { + NVPTX::AddressSpace CodeAddrSpace) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address // space. 
return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global && @@ -810,6 +812,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -829,6 +832,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -848,6 +852,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -868,6 +873,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? 
NVPTX::atomic_thread_fence_seq_cst_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.", ScopeToString(S))); } @@ -1046,7 +1052,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { const MVT LoadedVT = LoadedEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1118,7 +1124,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { const MVT MemVT = MemEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1334,7 +1340,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; // Address Space Setting - const unsigned CodeAddrSpace = getAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); SDLoc DL(ST); SDValue Chain = ST->getChain(); @@ -1384,7 +1390,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { assert(StoreVT.isSimple() && "Store value is not simple"); // Address Space Setting - const unsigned CodeAddrSpace = getAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); if (CodeAddrSpace == NVPTX::AddressSpace::Const) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index e792e441e49e6..8843326a7d748 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -149,6 +149,8 @@ inline std::string ScopeToString(Scope S) { return "Cluster"; case Scope::Device: return "Device"; + case Scope::DefaultDevice: + return "DefaultDevice"; } report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".", static_cast(S))); From 
9bfa008e68d5f16b97a6de3ad94efdcf90713ad1 Mon Sep 17 00:00:00 2001 From: Akshay Deodhar Date: Tue, 15 Jul 2025 12:51:56 -0700 Subject: [PATCH 26/26] Update llvm/lib/Target/NVPTX/NVPTX.h Co-authored-by: gonzalobg <65027571+gonzalobg@users.noreply.github.com> --- llvm/lib/Target/NVPTX/NVPTX.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 7b42537b04466..180ce4ab02a27 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -174,7 +174,7 @@ enum Scope : ScopeUnderlyingType { Cluster = 2, Device = 3, System = 4, - DefaultDevice = 5, + DefaultDevice = 5, // For SM < 70: denotes PTX op implicit/default .gpu scope LASTSCOPE = DefaultDevice };